Index: vendor/llvm/dist/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- vendor/llvm/dist/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 327151) +++ vendor/llvm/dist/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 327152) @@ -1,17727 +1,17728 @@ //===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run // both before and after the DAG is legalized. // // This pass is not a substitute for the LLVM IR instcombine pass. This pass is // primarily intended to handle simplification opportunities that are implicit // in the LLVM IR and exposed by the various codegen lowering phases. // //===----------------------------------------------------------------------===// #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "dagcombine" STATISTIC(NodesCombined , "Number of dag nodes combined"); STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created"); STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created"); STATISTIC(OpsNarrowed , "Number of load/op/store narrowed"); STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int"); STATISTIC(SlicedLoads, "Number of load sliced"); static cl::opt CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, cl::desc("Enable DAG combiner's use of IR alias analysis")); static cl::opt UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true), cl::desc("Enable DAG combiner's use of TBAA")); #ifndef NDEBUG static cl::opt CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden, cl::desc("Only use DAG-combiner alias analysis in this" " function")); #endif /// Hidden option to stress test load slicing, i.e., when this option /// is enabled, load slicing bypasses most of its profitability guards. static cl::opt StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden, cl::desc("Bypass the profitability model of load slicing"), cl::init(false)); static cl::opt MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), cl::desc("DAG combiner may split indexing from loads")); namespace { class DAGCombiner { SelectionDAG &DAG; const TargetLowering &TLI; CombineLevel Level; CodeGenOpt::Level OptLevel; bool LegalOperations = false; bool LegalTypes = false; bool ForCodeSize; /// \brief Worklist of all of the nodes that need to be simplified. /// /// This must behave as a stack -- new nodes to process are pushed onto the /// back and when processing we pop off of the back. /// /// The worklist will not contain duplicates but may contain null entries /// due to nodes being deleted from the underlying DAG. SmallVector Worklist; /// \brief Mapping from an SDNode to its position on the worklist. /// /// This is used to find and remove nodes from the worklist (by nulling /// them) when they are deleted from the underlying DAG. It relies on /// stable indices of nodes within the worklist. DenseMap WorklistMap; /// \brief Set of nodes which have been combined (at least once). /// /// This is used to allow us to reliably add any operands of a DAG node /// which have not yet been combined to the worklist. SmallPtrSet CombinedNodes; // AA - Used for DAG load/store alias analysis. AliasAnalysis *AA; /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. void AddUsersToWorklist(SDNode *N) { for (SDNode *Node : N->uses()) AddToWorklist(Node); } /// Call the node-specific routine that folds each particular type of node. SDValue visit(SDNode *N); public: DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) { ForCodeSize = DAG.getMachineFunction().getFunction().optForSize(); MaximumLegalStoreInBits = 0; for (MVT VT : MVT::all_valuetypes()) if (EVT(VT).isSimple() && VT != MVT::Other && TLI.isTypeLegal(EVT(VT)) && VT.getSizeInBits() >= MaximumLegalStoreInBits) MaximumLegalStoreInBits = VT.getSizeInBits(); } /// Add to the worklist making sure its instance is at the back (next to be /// processed.) void AddToWorklist(SDNode *N) { assert(N->getOpcode() != ISD::DELETED_NODE && "Deleted Node added to Worklist"); // Skip handle nodes as they can't usefully be combined and confuse the // zero-use deletion strategy. if (N->getOpcode() == ISD::HANDLENODE) return; if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) Worklist.push_back(N); } /// Remove all instances of N from the worklist. void removeFromWorklist(SDNode *N) { CombinedNodes.erase(N); auto It = WorklistMap.find(N); if (It == WorklistMap.end()) return; // Not in the worklist. // Null out the entry rather than erasing it to avoid a linear operation. Worklist[It->second] = nullptr; WorklistMap.erase(It); } void deleteAndRecombine(SDNode *N); bool recursivelyDeleteUnusedNodes(SDNode *N); /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo = true); /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) { return CombineTo(N, &Res, 1, AddTo); } /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true) { SDValue To[] = { Res0, Res1 }; return CombineTo(N, To, 2, AddTo); } void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); private: unsigned MaximumLegalStoreInBits; /// Check the specified integer node value to see if it can be simplified or /// if things it uses can be simplified by bit propagation. /// If so, return true. bool SimplifyDemandedBits(SDValue Op) { unsigned BitWidth = Op.getScalarValueSizeInBits(); APInt Demanded = APInt::getAllOnesValue(BitWidth); return SimplifyDemandedBits(Op, Demanded); } bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded); bool CombineToPreIndexedLoadStore(SDNode *N); bool CombineToPostIndexedLoadStore(SDNode *N); SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed /// load. /// /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. /// \param InVecVT type of the input vector to EVE with bitcasts resolved. /// \param EltNo index of the vector element to load. /// \param OriginalLoad load that EVE came from to be replaced. /// \returns EVE on success SDValue() on failure. SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad( SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); SDValue SExtPromoteOperand(SDValue Op, EVT PVT); SDValue ZExtPromoteOperand(SDValue Op, EVT PVT); SDValue PromoteIntBinOp(SDValue Op); SDValue PromoteIntShiftOp(SDValue Op); SDValue PromoteExtend(SDValue Op); bool PromoteLoad(SDValue Op); void ExtendSetCCUses(const SmallVectorImpl &SetCCs, SDValue Trunc, SDValue ExtLoad, const SDLoc &DL, ISD::NodeType ExtType); /// Call the node-specific routine that knows how to fold each /// particular type of node. If that doesn't do anything, try the /// target-specific DAG combines. SDValue combine(SDNode *N); // Visitation implementation - Implement dag node combining for different // node types. The semantics are as follows: // Return Value: // SDValue.getNode() == 0 - No change was made // SDValue.getNode() == N - N was replaced, is dead and has been handled. // otherwise - N should be replaced by the returned Operand. // SDValue visitTokenFactor(SDNode *N); SDValue visitMERGE_VALUES(SDNode *N); SDValue visitADD(SDNode *N); SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitSUB(SDNode *N); SDValue visitADDC(SDNode *N); SDValue visitUADDO(SDNode *N); SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N); SDValue visitSUBC(SDNode *N); SDValue visitUSUBO(SDNode *N); SDValue visitADDE(SDNode *N); SDValue visitADDCARRY(SDNode *N); SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N); SDValue visitSUBE(SDNode *N); SDValue visitSUBCARRY(SDNode *N); SDValue visitMUL(SDNode *N); SDValue useDivRem(SDNode *N); SDValue visitSDIV(SDNode *N); SDValue visitUDIV(SDNode *N); SDValue visitREM(SDNode *N); SDValue visitMULHU(SDNode *N); SDValue visitMULHS(SDNode *N); SDValue visitSMUL_LOHI(SDNode *N); SDValue visitUMUL_LOHI(SDNode *N); SDValue visitSMULO(SDNode *N); SDValue visitUMULO(SDNode *N); SDValue visitIMINMAX(SDNode *N); SDValue visitAND(SDNode *N); SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitOR(SDNode *N); SDValue visitORLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitXOR(SDNode *N); SDValue SimplifyVBinOp(SDNode *N); SDValue visitSHL(SDNode *N); SDValue visitSRA(SDNode *N); SDValue visitSRL(SDNode *N); SDValue visitRotate(SDNode *N); SDValue visitABS(SDNode *N); SDValue visitBSWAP(SDNode *N); SDValue visitBITREVERSE(SDNode *N); SDValue visitCTLZ(SDNode *N); SDValue visitCTLZ_ZERO_UNDEF(SDNode *N); SDValue visitCTTZ(SDNode *N); SDValue visitCTTZ_ZERO_UNDEF(SDNode *N); SDValue visitCTPOP(SDNode *N); SDValue visitSELECT(SDNode *N); SDValue visitVSELECT(SDNode *N); SDValue visitSELECT_CC(SDNode *N); SDValue visitSETCC(SDNode *N); SDValue visitSETCCE(SDNode *N); SDValue visitSETCCCARRY(SDNode *N); SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); SDValue visitAssertExt(SDNode *N); SDValue visitSIGN_EXTEND_INREG(SDNode *N); SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitTRUNCATE(SDNode *N); SDValue visitBITCAST(SDNode *N); SDValue visitBUILD_PAIR(SDNode *N); SDValue visitFADD(SDNode *N); SDValue visitFSUB(SDNode *N); SDValue visitFMUL(SDNode *N); SDValue visitFMA(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); SDValue visitFSQRT(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); SDValue visitSINT_TO_FP(SDNode *N); SDValue visitUINT_TO_FP(SDNode *N); SDValue visitFP_TO_SINT(SDNode *N); SDValue visitFP_TO_UINT(SDNode *N); SDValue visitFP_ROUND(SDNode *N); SDValue visitFP_ROUND_INREG(SDNode *N); SDValue visitFP_EXTEND(SDNode *N); SDValue visitFNEG(SDNode *N); SDValue visitFABS(SDNode *N); SDValue visitFCEIL(SDNode *N); SDValue visitFTRUNC(SDNode *N); SDValue visitFFLOOR(SDNode *N); SDValue visitFMINNUM(SDNode *N); SDValue visitFMAXNUM(SDNode *N); SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain); SDValue replaceStoreOfFPConstant(StoreSDNode *ST); SDValue visitSTORE(SDNode *N); SDValue visitINSERT_VECTOR_ELT(SDNode *N); SDValue visitEXTRACT_VECTOR_ELT(SDNode *N); SDValue visitBUILD_VECTOR(SDNode *N); SDValue visitCONCAT_VECTORS(SDNode *N); SDValue visitEXTRACT_SUBVECTOR(SDNode *N); SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitSCALAR_TO_VECTOR(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); SDValue visitMLOAD(SDNode *N); SDValue visitMSTORE(SDNode *N); SDValue visitMGATHER(SDNode *N); SDValue visitMSCATTER(SDNode *N); SDValue visitFP_TO_FP16(SDNode *N); SDValue visitFP16_TO_FP(SDNode *N); SDValue visitFADDForFMACombine(SDNode *N); SDValue visitFSUBForFMACombine(SDNode *N); SDValue visitFMULForFMADistributiveCombine(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue LHS, SDValue RHS); SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt); SDValue foldSelectOfConstants(SDNode *N); SDValue foldVSelectOfConstants(SDNode *N); SDValue foldBinOpIntoSelect(SDNode *BO); bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N); SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, bool NotExtCompare = false); SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, const SDLoc &DL); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans = true); bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, SDValue &CC) const; bool isOneUseSetCC(SDValue N) const; SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp); SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); SDValue CombineExtLoad(SDNode *N); SDValue combineRepeatedFPDivisors(SDNode *N); SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); SDValue BuildLogBase2(SDValue Op, const SDLoc &DL); SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip); SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations, SDNodeFlags Flags, bool Reciprocal); SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations, SDNodeFlags Flags, bool Reciprocal); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); SDValue MatchLoadCombine(SDNode *N); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue splitMergedValStore(StoreSDNode *ST); SDValue TransformFPLoadStorePair(SDNode *N); SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); SDValue reduceBuildVecToShuffle(SDNode *N); SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef VectorMask, SDValue VecIn1, SDValue VecIn2, unsigned LeftIdx); SDValue matchVSelectOpSizesWithSetCC(SDNode *N); /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallVectorImpl &Aliases); /// Return true if there is any possibility that the two addresses overlap. bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const; /// Walk up chain skipping non-aliasing memory nodes, looking for a better /// chain (aliasing node.) SDValue FindBetterChain(SDNode *N, SDValue Chain); /// Try to replace a store and any possibly adjacent stores on /// consecutive chains with better chains. Return true only if St is /// replaced. /// /// Notice that other chains may still be replaced even if the function /// returns false. bool findBetterNeighborChains(StoreSDNode *St); /// Match "(X shl/srl V1) & V2" where V2 may not be present. bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask); /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. struct MemOpLink { // Ptr to the mem node. LSBaseSDNode *MemNode; // Offset from the base ptr. int64_t OffsetFromBase; MemOpLink(LSBaseSDNode *N, int64_t Offset) : MemNode(N), OffsetFromBase(Offset) {} }; /// This is a helper function for visitMUL to check the profitability /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). /// MulNode is the original multiply, AddNode is (add x, c1), /// and ConstNode is c2. bool isMulAddWithConstProfitable(SDNode *MulNode, SDValue &AddNode, SDValue &ConstNode); /// This is a helper function for visitAND and visitZERO_EXTEND. Returns /// true if the (and (load x) c) pattern matches an extload. ExtVT returns /// the type of the loaded value to be extended. bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, EVT LoadResultTy, EVT &ExtVT); /// Helper function to calculate whether the given Load can have its /// width reduced to ExtVT. bool isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType, EVT &ExtVT, unsigned ShAmt = 0); /// Used by BackwardsPropagateMask to find suitable loads. bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl &Loads, SmallPtrSetImpl &NodeWithConsts, ConstantSDNode *Mask, SDNode *&UncombinedNode); /// Attempt to propagate a given AND node back to load leaves so that they /// can be combined into narrow loads. bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG); /// Helper function for MergeConsecutiveStores which merges the /// component store chains. SDValue getMergeStoreChains(SmallVectorImpl &StoreNodes, unsigned NumStores); /// This is a helper function for MergeConsecutiveStores. When the /// source elements of the consecutive stores are all constants or /// all extracted vector elements, try to merge them into one /// larger store introducing bitcasts if necessary. \return True /// if a merged store was created. bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl &StoreNodes, EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector, bool UseTrunc); /// This is a helper function for MergeConsecutiveStores. Stores /// that potentially may be merged with St are placed in /// StoreNodes. void getStoreMergeCandidates(StoreSDNode *St, SmallVectorImpl &StoreNodes); /// Helper function for MergeConsecutiveStores. Checks if /// candidate stores have indirect dependency through their /// operands. \return True if safe to merge. bool checkMergeStoreCandidatesForDependencies( SmallVectorImpl &StoreNodes, unsigned NumStores); /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return number of stores that were merged into a merged store (the /// affected nodes are stored as a prefix in \p StoreNodes). bool MergeConsecutiveStores(StoreSDNode *N); /// \brief Try to transform a truncation where C is a constant: /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) /// /// \p N needs to be a truncation and its first operand an AND. Other /// requirements are checked by the function (e.g. that trunc is /// single-use) and if missed an empty SDValue is returned. SDValue distributeTruncateThroughAnd(SDNode *N); public: /// Runs the dag combiner on all nodes in the work list void Run(CombineLevel AtLevel); SelectionDAG &getDAG() const { return DAG; } /// Returns a type large enough to hold any valid shift amount - before type /// legalization these can be huge. EVT getShiftAmountTy(EVT LHSTy) { assert(LHSTy.isInteger() && "Shift amount is not an integer type!"); if (LHSTy.isVector()) return LHSTy; auto &DL = DAG.getDataLayout(); return LegalTypes ? TLI.getScalarShiftAmountTy(DL, LHSTy) : TLI.getPointerTy(DL); } /// This method returns true if we are running before type legalization or /// if the specified VT is legal. bool isTypeLegal(const EVT &VT) { if (!LegalTypes) return true; return TLI.isTypeLegal(VT); } /// Convenience wrapper around TargetLowering::getSetCCResultType EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } }; /// This class is a DAGUpdateListener that removes any deleted /// nodes from the worklist. class WorklistRemover : public SelectionDAG::DAGUpdateListener { DAGCombiner &DC; public: explicit WorklistRemover(DAGCombiner &dc) : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} void NodeDeleted(SDNode *N, SDNode *E) override { DC.removeFromWorklist(N); } }; } // end anonymous namespace //===----------------------------------------------------------------------===// // TargetLowering::DAGCombinerInfo implementation //===----------------------------------------------------------------------===// void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) { ((DAGCombiner*)DC)->AddToWorklist(N); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, ArrayRef To, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, SDValue Res, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); } void TargetLowering::DAGCombinerInfo:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO); } //===----------------------------------------------------------------------===// // Helper Functions //===----------------------------------------------------------------------===// void DAGCombiner::deleteAndRecombine(SDNode *N) { removeFromWorklist(N); // If the operands of this node are only used by the node, they will now be // dead. Make sure to re-visit them and recursively delete dead nodes. for (const SDValue &Op : N->ops()) // For an operand generating multiple values, one of the values may // become dead allowing further simplification (e.g. split index // arithmetic from an indexed load). if (Op->hasOneUse() || Op->getNumValues() > 1) AddToWorklist(Op.getNode()); DAG.DeleteNode(N); } /// Return 1 if we can compute the negated form of the specified expression for /// the same cost as the expression itself, or 2 if we can compute the negated /// form more cheaply than the expression itself. static char isNegatibleForFree(SDValue Op, bool LegalOperations, const TargetLowering &TLI, const TargetOptions *Options, unsigned Depth = 0) { // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) return 2; // Don't allow anything with multiple uses. if (!Op.hasOneUse()) return 0; // Don't recurse exponentially. if (Depth > 6) return 0; switch (Op.getOpcode()) { default: return false; case ISD::ConstantFP: { if (!LegalOperations) return 1; // Don't invert constant FP values after legalization unless the target says // the negated constant is legal. EVT VT = Op.getValueType(); return TLI.isOperationLegal(ISD::ConstantFP, VT) || TLI.isFPImmLegal(neg(cast(Op)->getValueAPF()), VT); } case ISD::FADD: // FIXME: determine better conditions for this xform. if (!Options->UnsafeFPMath) return 0; // After operation legalization, it might not be legal to create new FSUBs. if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) return 0; // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1)) return V; // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, Depth + 1); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. if (!Options->NoSignedZerosFPMath && !Op.getNode()->getFlags().hasNoSignedZeros()) return 0; // fold (fneg (fsub A, B)) -> (fsub B, A) return 1; case ISD::FMUL: case ISD::FDIV: if (Options->HonorSignDependentRoundingFPMath()) return 0; // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1)) return V; return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, Depth + 1); case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FSIN: return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1); } } /// If isNegatibleForFree returns true, return the newly negated expression. static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, unsigned Depth = 0) { const TargetOptions &Options = DAG.getTarget().Options; // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0); // Don't allow anything with multiple uses. assert(Op.hasOneUse() && "Unknown reuse!"); assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree"); const SDNodeFlags Flags = Op.getNode()->getFlags(); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown code"); case ISD::ConstantFP: { APFloat V = cast(Op)->getValueAPF(); V.changeSign(); return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); } case ISD::FADD: // FIXME: determine better conditions for this xform. assert(Options.UnsafeFPMath); // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, DAG.getTargetLoweringInfo(), &Options, Depth+1)) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1), Flags); // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, Depth+1), Op.getOperand(0), Flags); case ISD::FSUB: // fold (fneg (fsub 0, B)) -> B if (ConstantFPSDNode *N0CFP = dyn_cast(Op.getOperand(0))) if (N0CFP->isZero()) return Op.getOperand(1); // fold (fneg (fsub A, B)) -> (fsub B, A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(0), Flags); case ISD::FMUL: case ISD::FDIV: assert(!Options.HonorSignDependentRoundingFPMath()); // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, DAG.getTargetLoweringInfo(), &Options, Depth+1)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1), Flags); // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, Depth+1), Flags); case ISD::FP_EXTEND: case ISD::FSIN: return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1)); case ISD::FP_ROUND: return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1)); } } // APInts must be the same size for most operations, this helper // function zero extends the shorter of the pair so that they match. // We provide an Offset so that we can create bitwidths that won't overflow. static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) { unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth()); LHS = LHS.zextOrSelf(Bits); RHS = RHS.zextOrSelf(Bits); } // Return true if this node is a setcc, or is a select_cc // that selects between the target values used for true and false, making it // equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to // the appropriate nodes based on the type of node we are checking. This // simplifies life a bit for the callers. bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, SDValue &CC) const { if (N.getOpcode() == ISD::SETCC) { LHS = N.getOperand(0); RHS = N.getOperand(1); CC = N.getOperand(2); return true; } if (N.getOpcode() != ISD::SELECT_CC || !TLI.isConstTrueVal(N.getOperand(2).getNode()) || !TLI.isConstFalseVal(N.getOperand(3).getNode())) return false; if (TLI.getBooleanContents(N.getValueType()) == TargetLowering::UndefinedBooleanContent) return false; LHS = N.getOperand(0); RHS = N.getOperand(1); CC = N.getOperand(4); return true; } /// Return true if this is a SetCC-equivalent operation with only one use. /// If this is true, it allows the users to invert the operation for free when /// it is profitable to do so. bool DAGCombiner::isOneUseSetCC(SDValue N) const { SDValue N0, N1, N2; if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse()) return true; return false; } // \brief Returns the SDNode if it is a constant float BuildVector // or constant float. static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) { if (isa(N)) return N.getNode(); if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) return N.getNode(); return nullptr; } // Determines if it is a constant integer or a build vector of constant // integers (and undefs). // Do not permit build vector implicit truncation. static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) { if (ConstantSDNode *Const = dyn_cast(N)) return !(Const->isOpaque() && NoOpaques); if (N.getOpcode() != ISD::BUILD_VECTOR) return false; unsigned BitWidth = N.getScalarValueSizeInBits(); for (const SDValue &Op : N->op_values()) { if (Op.isUndef()) continue; ConstantSDNode *Const = dyn_cast(Op); if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth || (Const->isOpaque() && NoOpaques)) return false; } return true; } // Determines if it is a constant null integer or a splatted vector of a // constant null integer (with no undefs). // Build vector implicit truncation is not an issue for null values. static bool isNullConstantOrNullSplatConstant(SDValue N) { if (ConstantSDNode *Splat = isConstOrConstSplat(N)) return Splat->isNullValue(); return false; } // Determines if it is a constant integer of one or a splatted vector of a // constant integer of one (with no undefs). // Do not permit build vector implicit truncation. static bool isOneConstantOrOneSplatConstant(SDValue N) { unsigned BitWidth = N.getScalarValueSizeInBits(); if (ConstantSDNode *Splat = isConstOrConstSplat(N)) return Splat->isOne() && Splat->getAPIntValue().getBitWidth() == BitWidth; return false; } // Determines if it is a constant integer of all ones or a splatted vector of a // constant integer of all ones (with no undefs). // Do not permit build vector implicit truncation. static bool isAllOnesConstantOrAllOnesSplatConstant(SDValue N) { unsigned BitWidth = N.getScalarValueSizeInBits(); if (ConstantSDNode *Splat = isConstOrConstSplat(N)) return Splat->isAllOnesValue() && Splat->getAPIntValue().getBitWidth() == BitWidth; return false; } // Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with // undef's. static bool isAnyConstantBuildVector(const SDNode *N) { return ISD::isBuildVectorOfConstantSDNodes(N) || ISD::isBuildVectorOfConstantFPSDNodes(N); } // Attempt to match a unary predicate against a scalar/splat constant or // every element of a constant BUILD_VECTOR. static bool matchUnaryPredicate(SDValue Op, std::function Match) { if (auto *Cst = dyn_cast(Op)) return Match(Cst); if (ISD::BUILD_VECTOR != Op.getOpcode()) return false; EVT SVT = Op.getValueType().getScalarType(); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { auto *Cst = dyn_cast(Op.getOperand(i)); if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst)) return false; } return true; } // Attempt to match a binary predicate against a pair of scalar/splat constants // or every element of a pair of constant BUILD_VECTORs. static bool matchBinaryPredicate( SDValue LHS, SDValue RHS, std::function Match) { if (LHS.getValueType() != RHS.getValueType()) return false; if (auto *LHSCst = dyn_cast(LHS)) if (auto *RHSCst = dyn_cast(RHS)) return Match(LHSCst, RHSCst); if (ISD::BUILD_VECTOR != LHS.getOpcode() || ISD::BUILD_VECTOR != RHS.getOpcode()) return false; EVT SVT = LHS.getValueType().getScalarType(); for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) { auto *LHSCst = dyn_cast(LHS.getOperand(i)); auto *RHSCst = dyn_cast(RHS.getOperand(i)); if (!LHSCst || !RHSCst) return false; if (LHSCst->getValueType(0) != SVT || LHSCst->getValueType(0) != RHSCst->getValueType(0)) return false; if (!Match(LHSCst, RHSCst)) return false; } return true; } SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1) { EVT VT = N0.getValueType(); if (N0.getOpcode() == Opc) { if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, L, R)) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode); return SDValue(); } if (N0.hasOneUse()) { // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one // use SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1); if (!OpNode.getNode()) return SDValue(); AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1)); } } } if (N1.getOpcode() == Opc) { if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) { if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0)) { // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, R, L)) return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode); return SDValue(); } if (N1.hasOneUse()) { // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one // use SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0)); if (!OpNode.getNode()) return SDValue(); AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1)); } } } return SDValue(); } SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo) { assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); ++NodesCombined; DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: "; To[0].getNode()->dump(&DAG); dbgs() << " and " << NumTo-1 << " other values\n"); for (unsigned i = 0, e = NumTo; i != e; ++i) assert((!To[i].getNode() || N->getValueType(i) == To[i].getValueType()) && "Cannot combine value to value of different type!"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesWith(N, To); if (AddTo) { // Push the new nodes and any users onto the worklist for (unsigned i = 0, e = NumTo; i != e; ++i) { if (To[i].getNode()) { AddToWorklist(To[i].getNode()); AddUsersToWorklist(To[i].getNode()); } } } // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. if (N->use_empty()) deleteAndRecombine(N); return SDValue(N, 0); } void DAGCombiner:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Replace all uses. If any nodes become isomorphic to other nodes and // are deleted, make sure to remove them from our worklist. WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New); // Push the new node and any (possibly new) users onto the worklist. AddToWorklist(TLO.New.getNode()); AddUsersToWorklist(TLO.New.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. if (TLO.Old.getNode()->use_empty()) deleteAndRecombine(TLO.Old.getNode()); } /// Check the specified integer node value to see if it can be simplified or if /// things it uses can be simplified by bit propagation. If so, return true. bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); KnownBits Known; if (!TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO)) return false; // Revisit the node. AddToWorklist(Op.getNode()); // Replace the old value with the new one. ++NodesCombined; DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); dbgs() << '\n'); CommitTargetLoweringOpt(TLO); return true; } void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { SDLoc DL(Load); EVT VT = Load->getValueType(0); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0)); DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: "; Trunc.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); deleteAndRecombine(Load); AddToWorklist(Trunc.getNode()); } SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { Replace = false; SDLoc DL(Op); if (ISD::isUNINDEXEDLoad(Op.getNode())) { LoadSDNode *LD = cast(Op); EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD) : LD->getExtensionType(); Replace = true; return DAG.getExtLoad(ExtType, DL, PVT, LD->getChain(), LD->getBasePtr(), MemVT, LD->getMemOperand()); } unsigned Opc = Op.getOpcode(); switch (Opc) { default: break; case ISD::AssertSext: if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT)) return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1)); break; case ISD::AssertZext: if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT)) return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1)); break; case ISD::Constant: { unsigned ExtOpc = Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, DL, PVT, Op); } } if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT)) return SDValue(); return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op); } SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) { if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT)) return SDValue(); EVT OldVT = Op.getValueType(); SDLoc DL(Op); bool Replace = false; SDValue NewOp = PromoteOperand(Op, PVT, Replace); if (!NewOp.getNode()) return SDValue(); AddToWorklist(NewOp.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp, DAG.getValueType(OldVT)); } SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) { EVT OldVT = Op.getValueType(); SDLoc DL(Op); bool Replace = false; SDValue NewOp = PromoteOperand(Op, PVT, Replace); if (!NewOp.getNode()) return SDValue(); AddToWorklist(NewOp.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); return DAG.getZeroExtendInReg(NewOp, DL, OldVT); } /// Promote the specified integer binary operation if the target indicates it is /// beneficial. e.g. On x86, it's usually better to promote i16 operations to /// i32 since i16 instructions are longer. SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); bool Replace0 = false; SDValue N0 = Op.getOperand(0); SDValue NN0 = PromoteOperand(N0, PVT, Replace0); bool Replace1 = false; SDValue N1 = Op.getOperand(1); SDValue NN1 = PromoteOperand(N1, PVT, Replace1); SDLoc DL(Op); SDValue RV = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1)); // We are always replacing N0/N1's use in N and only need // additional replacements if there are additional uses. Replace0 &= !N0->hasOneUse(); Replace1 &= (N0 != N1) && !N1->hasOneUse(); // Combine Op here so it is preserved past replacements. CombineTo(Op.getNode(), RV); // If operands have a use ordering, make sure we deal with // predecessor first. if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) { std::swap(N0, N1); std::swap(NN0, NN1); } if (Replace0) { AddToWorklist(NN0.getNode()); ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); } if (Replace1) { AddToWorklist(NN1.getNode()); ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode()); } return Op; } return SDValue(); } /// Promote the specified integer shift operation if the target indicates it is /// beneficial. e.g. On x86, it's usually better to promote i16 operations to /// i32 since i16 instructions are longer. SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); bool Replace = false; SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); if (Opc == ISD::SRA) N0 = SExtPromoteOperand(N0, PVT); else if (Opc == ISD::SRL) N0 = ZExtPromoteOperand(N0, PVT); else N0 = PromoteOperand(N0, PVT, Replace); if (!N0.getNode()) return SDValue(); SDLoc DL(Op); SDValue RV = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); AddToWorklist(N0.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); // Deal with Op being deleted. if (Op && Op.getOpcode() != ISD::DELETED_NODE) return RV; } return SDValue(); } SDValue DAGCombiner::PromoteExtend(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) // fold (aext (sext x)) -> (sext x) DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0)); } return SDValue(); } bool DAGCombiner::PromoteLoad(SDValue Op) { if (!LegalOperations) return false; if (!ISD::isUNINDEXEDLoad(Op.getNode())) return false; EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return false; // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return false; EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); SDLoc DL(Op); SDNode *N = Op.getNode(); LoadSDNode *LD = cast(N); EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD) : LD->getExtensionType(); SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT, LD->getChain(), LD->getBasePtr(), MemVT, LD->getMemOperand()); SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); deleteAndRecombine(N); AddToWorklist(Result.getNode()); return true; } return false; } /// \brief Recursively delete a node which has no uses and any operands for /// which it is the only use. /// /// Note that this both deletes the nodes and removes them from the worklist. /// It also adds any nodes who have had a user deleted to the worklist as they /// may now have only one use and subject to other combines. bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) { if (!N->use_empty()) return false; SmallSetVector Nodes; Nodes.insert(N); do { N = Nodes.pop_back_val(); if (!N) continue; if (N->use_empty()) { for (const SDValue &ChildN : N->op_values()) Nodes.insert(ChildN.getNode()); removeFromWorklist(N); DAG.DeleteNode(N); } else { AddToWorklist(N); } } while (!Nodes.empty()); return true; } //===----------------------------------------------------------------------===// // Main DAG Combiner implementation //===----------------------------------------------------------------------===// void DAGCombiner::Run(CombineLevel AtLevel) { // set the instance variables, so that the various visit routines may use it. Level = AtLevel; LegalOperations = Level >= AfterLegalizeVectorOps; LegalTypes = Level >= AfterLegalizeTypes; // Add all the dag nodes to the worklist. for (SDNode &Node : DAG.allnodes()) AddToWorklist(&Node); // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted, and tracking any // changes of the root. HandleSDNode Dummy(DAG.getRoot()); // While the worklist isn't empty, find a node and try to combine it. while (!WorklistMap.empty()) { SDNode *N; // The Worklist holds the SDNodes in order, but it may contain null entries. do { N = Worklist.pop_back_val(); } while (!N); bool GoodWorklistEntry = WorklistMap.erase(N); (void)GoodWorklistEntry; assert(GoodWorklistEntry && "Found a worklist entry without a corresponding map entry!"); // If N has no uses, it is dead. Make sure to revisit all N's operands once // N is deleted from the DAG, since they too may now be dead or may have a // reduced number of uses, allowing other xforms. if (recursivelyDeleteUnusedNodes(N)) continue; WorklistRemover DeadNodes(*this); // If this combine is running after legalizing the DAG, re-legalize any // nodes pulled off the worklist. if (Level == AfterLegalizeDAG) { SmallSetVector UpdatedNodes; bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes); for (SDNode *LN : UpdatedNodes) { AddToWorklist(LN); AddUsersToWorklist(LN); } if (!NIsValid) continue; } DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG)); // Add any operands of the new node which have not yet been combined to the // worklist as well. Because the worklist uniques things already, this // won't repeatedly process the same operand. CombinedNodes.insert(N); for (const SDValue &ChildN : N->op_values()) if (!CombinedNodes.count(ChildN.getNode())) AddToWorklist(ChildN.getNode()); SDValue RV = combine(N); if (!RV.getNode()) continue; ++NodesCombined; // If we get back the same node we passed in, rather than a new node or // zero, we know that the node must have defined multiple values and // CombineTo was used. Since CombineTo takes care of the worklist // mechanics for us, we have no work to do in this case. if (RV.getNode() == N) continue; assert(N->getOpcode() != ISD::DELETED_NODE && RV.getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned new node!"); DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG)); if (N->getNumValues() == RV.getNode()->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode()); else { assert(N->getValueType(0) == RV.getValueType() && N->getNumValues() == 1 && "Type mismatch"); DAG.ReplaceAllUsesWith(N, &RV); } // Push the new node and any users onto the worklist AddToWorklist(RV.getNode()); AddUsersToWorklist(RV.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. This will also take care of adding any // operands which have lost a user to the worklist. recursivelyDeleteUnusedNodes(N); } // If the root changed (e.g. it was a dead load, update the root). DAG.setRoot(Dummy.getValue()); DAG.RemoveDeadNodes(); } SDValue DAGCombiner::visit(SDNode *N) { switch (N->getOpcode()) { default: break; case ISD::TokenFactor: return visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD: return visitADD(N); case ISD::SUB: return visitSUB(N); case ISD::ADDC: return visitADDC(N); case ISD::UADDO: return visitUADDO(N); case ISD::SUBC: return visitSUBC(N); case ISD::USUBO: return visitUSUBO(N); case ISD::ADDE: return visitADDE(N); case ISD::ADDCARRY: return visitADDCARRY(N); case ISD::SUBE: return visitSUBE(N); case ISD::SUBCARRY: return visitSUBCARRY(N); case ISD::MUL: return visitMUL(N); case ISD::SDIV: return visitSDIV(N); case ISD::UDIV: return visitUDIV(N); case ISD::SREM: case ISD::UREM: return visitREM(N); case ISD::MULHU: return visitMULHU(N); case ISD::MULHS: return visitMULHS(N); case ISD::SMUL_LOHI: return visitSMUL_LOHI(N); case ISD::UMUL_LOHI: return visitUMUL_LOHI(N); case ISD::SMULO: return visitSMULO(N); case ISD::UMULO: return visitUMULO(N); case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: return visitIMINMAX(N); case ISD::AND: return visitAND(N); case ISD::OR: return visitOR(N); case ISD::XOR: return visitXOR(N); case ISD::SHL: return visitSHL(N); case ISD::SRA: return visitSRA(N); case ISD::SRL: return visitSRL(N); case ISD::ROTR: case ISD::ROTL: return visitRotate(N); case ISD::ABS: return visitABS(N); case ISD::BSWAP: return visitBSWAP(N); case ISD::BITREVERSE: return visitBITREVERSE(N); case ISD::CTLZ: return visitCTLZ(N); case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N); case ISD::CTTZ: return visitCTTZ(N); case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N); case ISD::CTPOP: return visitCTPOP(N); case ISD::SELECT: return visitSELECT(N); case ISD::VSELECT: return visitVSELECT(N); case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); case ISD::SETCCE: return visitSETCCE(N); case ISD::SETCCCARRY: return visitSETCCCARRY(N); case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); case ISD::AssertSext: case ISD::AssertZext: return visitAssertExt(N); case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N); case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N); case ISD::TRUNCATE: return visitTRUNCATE(N); case ISD::BITCAST: return visitBITCAST(N); case ISD::BUILD_PAIR: return visitBUILD_PAIR(N); case ISD::FADD: return visitFADD(N); case ISD::FSUB: return visitFSUB(N); case ISD::FMUL: return visitFMUL(N); case ISD::FMA: return visitFMA(N); case ISD::FDIV: return visitFDIV(N); case ISD::FREM: return visitFREM(N); case ISD::FSQRT: return visitFSQRT(N); case ISD::FCOPYSIGN: return visitFCOPYSIGN(N); case ISD::SINT_TO_FP: return visitSINT_TO_FP(N); case ISD::UINT_TO_FP: return visitUINT_TO_FP(N); case ISD::FP_TO_SINT: return visitFP_TO_SINT(N); case ISD::FP_TO_UINT: return visitFP_TO_UINT(N); case ISD::FP_ROUND: return visitFP_ROUND(N); case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N); case ISD::FP_EXTEND: return visitFP_EXTEND(N); case ISD::FNEG: return visitFNEG(N); case ISD::FABS: return visitFABS(N); case ISD::FFLOOR: return visitFFLOOR(N); case ISD::FMINNUM: return visitFMINNUM(N); case ISD::FMAXNUM: return visitFMAXNUM(N); case ISD::FCEIL: return visitFCEIL(N); case ISD::FTRUNC: return visitFTRUNC(N); case ISD::BRCOND: return visitBRCOND(N); case ISD::BR_CC: return visitBR_CC(N); case ISD::LOAD: return visitLOAD(N); case ISD::STORE: return visitSTORE(N); case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N); case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N); case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N); case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N); case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N); case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N); case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N); case ISD::MGATHER: return visitMGATHER(N); case ISD::MLOAD: return visitMLOAD(N); case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); } return SDValue(); } SDValue DAGCombiner::combine(SDNode *N) { SDValue RV = visit(N); // If nothing happened, try a target-specific DAG combine. if (!RV.getNode()) { assert(N->getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned NULL!"); if (N->getOpcode() >= ISD::BUILTIN_OP_END || TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) { // Expose the DAG combiner to the target combiner impls. TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); RV = TLI.PerformDAGCombine(N, DagCombineInfo); } } // If nothing happened still, try promoting the operation. if (!RV.getNode()) { switch (N->getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: RV = PromoteIntBinOp(SDValue(N, 0)); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: RV = PromoteIntShiftOp(SDValue(N, 0)); break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: RV = PromoteExtend(SDValue(N, 0)); break; case ISD::LOAD: if (PromoteLoad(SDValue(N, 0))) RV = SDValue(N, 0); break; } } // If N is a commutative binary node, try eliminate it if the commuted // version is already present in the DAG. if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) && N->getNumValues() == 1) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Constant operands are canonicalized to RHS. if (N0 != N1 && (isa(N0) || !isa(N1))) { SDValue Ops[] = {N1, N0}; SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops, N->getFlags()); if (CSENode) return SDValue(CSENode, 0); } } return RV; } /// Given a node, return its input chain if it has one, otherwise return a null /// sd operand. static SDValue getInputChainForNode(SDNode *N) { if (unsigned NumOps = N->getNumOperands()) { if (N->getOperand(0).getValueType() == MVT::Other) return N->getOperand(0); if (N->getOperand(NumOps-1).getValueType() == MVT::Other) return N->getOperand(NumOps-1); for (unsigned i = 1; i < NumOps-1; ++i) if (N->getOperand(i).getValueType() == MVT::Other) return N->getOperand(i); } return SDValue(); } SDValue DAGCombiner::visitTokenFactor(SDNode *N) { // If N has two operands, where one has an input chain equal to the other, // the 'other' chain is redundant. if (N->getNumOperands() == 2) { if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1)) return N->getOperand(0); if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0)) return N->getOperand(1); } SmallVector TFs; // List of token factors to visit. SmallVector Ops; // Ops for replacing token factor. SmallPtrSet SeenOps; bool Changed = false; // If we should replace this token factor. // Start out with this token factor. TFs.push_back(N); // Iterate through token factors. The TFs grows when new token factors are // encountered. for (unsigned i = 0; i < TFs.size(); ++i) { SDNode *TF = TFs[i]; // Check each of the operands. for (const SDValue &Op : TF->op_values()) { switch (Op.getOpcode()) { case ISD::EntryToken: // Entry tokens don't need to be added to the list. They are // redundant. Changed = true; break; case ISD::TokenFactor: if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) { // Queue up for processing. TFs.push_back(Op.getNode()); // Clean up in case the token factor is removed. AddToWorklist(Op.getNode()); Changed = true; break; } LLVM_FALLTHROUGH; default: // Only add if it isn't already in the list. if (SeenOps.insert(Op.getNode()).second) Ops.push_back(Op); else Changed = true; break; } } } // Remove Nodes that are chained to another node in the list. Do so // by walking up chains breath-first stopping when we've seen // another operand. In general we must climb to the EntryNode, but we can exit // early if we find all remaining work is associated with just one operand as // no further pruning is possible. // List of nodes to search through and original Ops from which they originate. SmallVector, 8> Worklist; SmallVector OpWorkCount; // Count of work for each Op. SmallPtrSet SeenChains; bool DidPruneOps = false; unsigned NumLeftToConsider = 0; for (const SDValue &Op : Ops) { Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++)); OpWorkCount.push_back(1); } auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) { // If this is an Op, we can remove the op from the list. Remark any // search associated with it as from the current OpNumber. if (SeenOps.count(Op) != 0) { Changed = true; DidPruneOps = true; unsigned OrigOpNumber = 0; while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op) OrigOpNumber++; assert((OrigOpNumber != Ops.size()) && "expected to find TokenFactor Operand"); // Re-mark worklist from OrigOpNumber to OpNumber for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) { if (Worklist[i].second == OrigOpNumber) { Worklist[i].second = OpNumber; } } OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber]; OpWorkCount[OrigOpNumber] = 0; NumLeftToConsider--; } // Add if it's a new chain if (SeenChains.insert(Op).second) { OpWorkCount[OpNumber]++; Worklist.push_back(std::make_pair(Op, OpNumber)); } }; for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) { // We need at least be consider at least 2 Ops to prune. if (NumLeftToConsider <= 1) break; auto CurNode = Worklist[i].first; auto CurOpNumber = Worklist[i].second; assert((OpWorkCount[CurOpNumber] > 0) && "Node should not appear in worklist"); switch (CurNode->getOpcode()) { case ISD::EntryToken: // Hitting EntryToken is the only way for the search to terminate without // hitting // another operand's search. Prevent us from marking this operand // considered. NumLeftToConsider++; break; case ISD::TokenFactor: for (const SDValue &Op : CurNode->op_values()) AddToWorklist(i, Op.getNode(), CurOpNumber); break; case ISD::CopyFromReg: case ISD::CopyToReg: AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber); break; default: if (auto *MemNode = dyn_cast(CurNode)) AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber); break; } OpWorkCount[CurOpNumber]--; if (OpWorkCount[CurOpNumber] == 0) NumLeftToConsider--; } // If we've changed things around then replace token factor. if (Changed) { SDValue Result; if (Ops.empty()) { // The entry token is the only possible outcome. Result = DAG.getEntryNode(); } else { if (DidPruneOps) { SmallVector PrunedOps; // for (const SDValue &Op : Ops) { if (SeenChains.count(Op.getNode()) == 0) PrunedOps.push_back(Op); } Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps); } else { Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); } } return Result; } return SDValue(); } /// MERGE_VALUES can always be eliminated. SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { WorklistRemover DeadNodes(*this); // Replacing results may cause a different MERGE_VALUES to suddenly // be CSE'd with N, and carry its uses with it. Iterate until no // uses remain, to ensure that the node can be safely deleted. // First add the users of this node to the work list so that they // can be tried again once they have new operands. AddUsersToWorklist(N); do { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i)); } while (!N->use_empty()); deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } /// If \p N is a ConstantSDNode with isOpaque() == false return it casted to a /// ConstantSDNode pointer else nullptr. static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) { ConstantSDNode *Const = dyn_cast(N); return Const != nullptr && !Const->isOpaque() ? Const : nullptr; } SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { auto BinOpcode = BO->getOpcode(); assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB || BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV || BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM || BinOpcode == ISD::UREM || BinOpcode == ISD::AND || BinOpcode == ISD::OR || BinOpcode == ISD::XOR || BinOpcode == ISD::SHL || BinOpcode == ISD::SRL || BinOpcode == ISD::SRA || BinOpcode == ISD::FADD || BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL || BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) && "Unexpected binary operator"); // Bail out if any constants are opaque because we can't constant fold those. SDValue C1 = BO->getOperand(1); if (!isConstantOrConstantVector(C1, true) && !isConstantFPBuildVectorOrConstantFP(C1)) return SDValue(); // Don't do this unless the old select is going away. We want to eliminate the // binary operator, not replace a binop with a select. // TODO: Handle ISD::SELECT_CC. SDValue Sel = BO->getOperand(0); if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) return SDValue(); SDValue CT = Sel.getOperand(1); if (!isConstantOrConstantVector(CT, true) && !isConstantFPBuildVectorOrConstantFP(CT)) return SDValue(); SDValue CF = Sel.getOperand(2); if (!isConstantOrConstantVector(CF, true) && !isConstantFPBuildVectorOrConstantFP(CF)) return SDValue(); // We have a select-of-constants followed by a binary operator with a // constant. Eliminate the binop by pulling the constant math into the select. // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1 EVT VT = Sel.getValueType(); SDLoc DL(Sel); SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1); assert((NewCT.isUndef() || isConstantOrConstantVector(NewCT) || isConstantFPBuildVectorOrConstantFP(NewCT)) && "Failed to constant fold a binop with constant operands"); SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1); assert((NewCF.isUndef() || isConstantOrConstantVector(NewCF) || isConstantFPBuildVectorOrConstantFP(NewCF)) && "Failed to constant fold a binop with constant operands"); return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); } SDValue DAGCombiner::visitADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); SDLoc DL(N); // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (add x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; } // fold (add x, undef) -> undef if (N0.isUndef()) return N0; if (N1.isUndef()) return N1; if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { // canonicalize constant to RHS if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::ADD, DL, VT, N1, N0); // fold (add c1, c2) -> c1+c2 return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N0.getNode(), N1.getNode()); } // fold (add x, 0) -> x if (isNullConstant(N1)) return N0; if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) { // fold ((c1-A)+c2) -> (c1+c2)-A if (N0.getOpcode() == ISD::SUB && isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { // FIXME: Adding 2 constants should be handled by FoldConstantArithmetic. return DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)), N0.getOperand(1)); } // add (sext i1 X), 1 -> zext (not i1 X) // We don't transform this pattern: // add (zext i1 X), -1 -> sext (not i1 X) // because most (?) targets generate better code for the zext form. if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && isOneConstantOrOneSplatConstant(N1)) { SDValue X = N0.getOperand(0); if ((!LegalOperations || (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) && X.getScalarValueSizeInBits() == 1) { SDValue Not = DAG.getNOT(DL, X, X.getValueType()); return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); } } // Undo the add -> or combine to merge constant offsets from a frame index. if (N0.getOpcode() == ISD::OR && isa(N0.getOperand(0)) && isa(N0.getOperand(1)) && DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) { SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1)); return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0); } } if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // reassociate add if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1)) return RADD; // fold ((0-A) + B) -> B-A if (N0.getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N0.getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1)); // fold (A + (0-B)) -> A-B if (N1.getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1)); // fold (A+(B-A)) -> B if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1)) return N1.getOperand(0); // fold ((B-A)+A) -> B if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1)) return N0.getOperand(0); // fold (A+(B-(A+C))) to (B-C) if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && N0 == N1.getOperand(1).getOperand(0)) return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), N1.getOperand(1).getOperand(1)); // fold (A+(B-(C+A))) to (B-C) if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && N0 == N1.getOperand(1).getOperand(1)) return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), N1.getOperand(1).getOperand(0)); // fold (A+((B-A)+or-C)) to (B+or-C) if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) && N1.getOperand(0).getOpcode() == ISD::SUB && N0 == N1.getOperand(0).getOperand(1)) return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0), N1.getOperand(1)); // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10)) return DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10), DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11)); } if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (a+b) -> (a|b) iff a and b share no bits. if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && DAG.haveNoCommonBitsSet(N0, N1)) return DAG.getNode(ISD::OR, DL, VT, N0, N1); if (SDValue Combined = visitADDLike(N0, N1, N)) return Combined; if (SDValue Combined = visitADDLike(N1, N0, N)) return Combined; return SDValue(); } static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { bool Masked = false; // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization. while (true) { if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) { V = V.getOperand(0); continue; } if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) { Masked = true; V = V.getOperand(0); continue; } break; } // If this is not a carry, return. if (V.getResNo() != 1) return SDValue(); if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY && V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) return SDValue(); // If the result is masked, then no matter what kind of bool it is we can // return. If it isn't, then we need to make sure the bool type is either 0 or // 1 and not other values. if (Masked || TLI.getBooleanContents(V.getValueType()) == TargetLoweringBase::ZeroOrOneBooleanContent) return V; return SDValue(); } SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) { EVT VT = N0.getValueType(); SDLoc DL(LocReference); // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N0, DAG.getNode(ISD::SHL, DL, VT, N1.getOperand(0).getOperand(1), N1.getOperand(1))); if (N1.getOpcode() == ISD::AND) { SDValue AndOp0 = N1.getOperand(0); unsigned NumSignBits = DAG.ComputeNumSignBits(AndOp0); unsigned DestBits = VT.getScalarSizeInBits(); // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x)) // and similar xforms where the inner op is either ~0 or 0. if (NumSignBits == DestBits && isOneConstantOrOneSplatConstant(N1->getOperand(1))) return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0); } // add (sext i1), X -> sub X, (zext i1) if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.getOperand(0).getValueType() == MVT::i1 && !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) { SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt); } // add X, (sextinreg Y i1) -> sub X, (and Y 1) if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { VTSDNode *TN = cast(N1.getOperand(1)); if (TN->getVT() == MVT::i1) { SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), DAG.getConstant(1, DL, VT)); return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt); } } // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) && N1.getResNo() == 0) return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), N0, N1.getOperand(0), N1.getOperand(2)); // (add X, Carry) -> (addcarry X, 0, Carry) if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) if (SDValue Carry = getAsCarry(TLI, N1)) return DAG.getNode(ISD::ADDCARRY, DL, DAG.getVTList(VT, Carry.getValueType()), N0, DAG.getConstant(0, DL, VT), Carry); return SDValue(); } SDValue DAGCombiner::visitADDC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); SDLoc DL(N); // If the flag result is dead, turn this into an ADD. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // canonicalize constant to RHS. ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0); // fold (addc x, 0) -> x + no carry out if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // If it cannot overflow, transform into an add. if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); return SDValue(); } SDValue DAGCombiner::visitUADDO(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); if (VT.isVector()) return SDValue(); EVT CarryVT = N->getValueType(1); SDLoc DL(N); // If the flag result is dead, turn this into an ADD. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), DAG.getUNDEF(CarryVT)); // canonicalize constant to RHS. ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0); // fold (uaddo x, 0) -> x + no carry out if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); // If it cannot overflow, transform into an add. if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), DAG.getConstant(0, DL, CarryVT)); if (SDValue Combined = visitUADDOLike(N0, N1, N)) return Combined; if (SDValue Combined = visitUADDOLike(N1, N0, N)) return Combined; return SDValue(); } SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { auto VT = N0.getValueType(); // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) // If Y + 1 cannot overflow. if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) { SDValue Y = N1.getOperand(0); SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType()); if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never) return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y, N1.getOperand(2)); } // (uaddo X, Carry) -> (addcarry X, 0, Carry) if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) if (SDValue Carry = getAsCarry(TLI, N1)) return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, DAG.getConstant(0, SDLoc(N), VT), Carry); return SDValue(); } SDValue DAGCombiner::visitADDE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); // canonicalize constant to RHS ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(), N1, N0, CarryIn); // fold (adde x, y, false) -> (addc x, y) if (CarryIn.getOpcode() == ISD::CARRY_FALSE) return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1); return SDValue(); } SDValue DAGCombiner::visitADDCARRY(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); SDLoc DL(N); // canonicalize constant to RHS ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); // fold (addcarry x, y, false) -> (uaddo x, y) if (isNullConstant(CarryIn)) return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry. if (isNullConstant(N0) && isNullConstant(N1)) { EVT VT = N0.getValueType(); EVT CarryVT = CarryIn.getValueType(); SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT); AddToWorklist(CarryExt.getNode()); return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt, DAG.getConstant(1, DL, VT)), DAG.getConstant(0, DL, CarryVT)); } if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N)) return Combined; if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N)) return Combined; return SDValue(); } SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N) { // Iff the flag result is dead: // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry) if ((N0.getOpcode() == ISD::ADD || (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0)) && isNullConstant(N1) && !N->hasAnyUseOfValue(1)) return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0.getOperand(0), N0.getOperand(1), CarryIn); /** * When one of the addcarry argument is itself a carry, we may be facing * a diamond carry propagation. In which case we try to transform the DAG * to ensure linear carry propagation if that is possible. * * We are trying to get: * (addcarry X, 0, (addcarry A, B, Z):Carry) */ if (auto Y = getAsCarry(TLI, N1)) { /** * (uaddo A, B) * / \ * Carry Sum * | \ * | (addcarry *, 0, Z) * | / * \ Carry * | / * (addcarry X, *, *) */ if (Y.getOpcode() == ISD::UADDO && CarryIn.getResNo() == 1 && CarryIn.getOpcode() == ISD::ADDCARRY && isNullConstant(CarryIn.getOperand(1)) && CarryIn.getOperand(0) == Y.getValue(0)) { auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(), Y.getOperand(0), Y.getOperand(1), CarryIn.getOperand(2)); AddToWorklist(NewY.getNode()); return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, DAG.getConstant(0, SDLoc(N), N0.getValueType()), NewY.getValue(1)); } } return SDValue(); } // Since it may not be valid to emit a fold to zero for vector initializers // check if we can before folding. static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT, SelectionDAG &DAG, bool LegalOperations, bool LegalTypes) { if (!VT.isVector()) return DAG.getConstant(0, DL, VT); if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) return DAG.getConstant(0, DL, VT); return SDValue(); } SDValue DAGCombiner::visitSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); SDLoc DL(N); // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (sub x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; } // fold (sub x, x) -> 0 // FIXME: Refactor this and xor and other similar operations together. if (N0 == N1) return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations, LegalTypes); if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // fold (sub c1, c2) -> c1-c2 return DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(), N1.getNode()); } if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); // fold (sub x, c) -> (add x, -c) if (N1C) { return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); } if (isNullConstantOrNullSplatConstant(N0)) { unsigned BitWidth = VT.getScalarSizeInBits(); // Right-shifting everything out but the sign bit followed by negation is // the same as flipping arithmetic/logical shift type without the negation: // -(X >>u 31) -> (X >>s 31) // -(X >>s 31) -> (X >>u 31) if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) { ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1)); if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1) { auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA; if (!LegalOperations || TLI.isOperationLegal(NewSh, VT)) return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1)); } } // 0 - X --> 0 if the sub is NUW. if (N->getFlags().hasNoUnsignedWrap()) return N0; if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) { // N1 is either 0 or the minimum signed value. If the sub is NSW, then // N1 must be 0 because negating the minimum signed value is undefined. if (N->getFlags().hasNoSignedWrap()) return N0; // 0 - X --> X if X is 0 or the minimum signed value. return N1; } } // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) if (isAllOnesConstantOrAllOnesSplatConstant(N0)) return DAG.getNode(ISD::XOR, DL, VT, N1, N0); // fold A-(A-B) -> B if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0)) return N1.getOperand(1); // fold (A+B)-A -> B if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1) return N0.getOperand(1); // fold (A+B)-B -> A if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1) return N0.getOperand(0); // fold C2-(A+C1) -> (C2-C1)-A if (N1.getOpcode() == ISD::ADD) { SDValue N11 = N1.getOperand(1); if (isConstantOrConstantVector(N0, /* NoOpaques */ true) && isConstantOrConstantVector(N11, /* NoOpaques */ true)) { SDValue NewC = DAG.getNode(ISD::SUB, DL, VT, N0, N11); return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0)); } } // fold ((A+(B+or-C))-B) -> A+or-C if (N0.getOpcode() == ISD::ADD && (N0.getOperand(1).getOpcode() == ISD::SUB || N0.getOperand(1).getOpcode() == ISD::ADD) && N0.getOperand(1).getOperand(0) == N1) return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0), N0.getOperand(1).getOperand(1)); // fold ((A+(C+B))-B) -> A+C if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD && N0.getOperand(1).getOperand(1) == N1) return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N0.getOperand(1).getOperand(0)); // fold ((A-(B-C))-C) -> A-B if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB && N0.getOperand(1).getOperand(1) == N1) return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N0.getOperand(1).getOperand(0)); // If either operand of a sub is undef, the result is undef if (N0.isUndef()) return N0; if (N1.isUndef()) return N1; // If the relocation model supports it, consider symbol offsets. if (GlobalAddressSDNode *GA = dyn_cast(N0)) if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) { // fold (sub Sym, c) -> Sym-c if (N1C && GA->getOpcode() == ISD::GlobalAddress) return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT, GA->getOffset() - (uint64_t)N1C->getSExtValue()); // fold (sub Sym+c1, Sym+c2) -> c1-c2 if (GlobalAddressSDNode *GB = dyn_cast(N1)) if (GA->getGlobal() == GB->getGlobal()) return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(), DL, VT); } // sub X, (sextinreg Y i1) -> add X, (and Y 1) if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { VTSDNode *TN = cast(N1.getOperand(1)); if (TN->getVT() == MVT::i1) { SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), DAG.getConstant(1, DL, VT)); return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt); } } return SDValue(); } SDValue DAGCombiner::visitSUBC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); SDLoc DL(N); // If the flag result is dead, turn this into an SUB. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // fold (subc x, x) -> 0 + no borrow if (N0 == N1) return CombineTo(N, DAG.getConstant(0, DL, VT), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // fold (subc x, 0) -> x + no borrow if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow if (isAllOnesConstant(N0)) return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); return SDValue(); } SDValue DAGCombiner::visitUSUBO(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); if (VT.isVector()) return SDValue(); EVT CarryVT = N->getValueType(1); SDLoc DL(N); // If the flag result is dead, turn this into an SUB. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), DAG.getUNDEF(CarryVT)); // fold (usubo x, x) -> 0 + no borrow if (N0 == N1) return CombineTo(N, DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, CarryVT)); // fold (usubo x, 0) -> x + no borrow if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow if (isAllOnesConstant(N0)) return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), DAG.getConstant(0, DL, CarryVT)); return SDValue(); } SDValue DAGCombiner::visitSUBE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); // fold (sube x, y, false) -> (subc x, y) if (CarryIn.getOpcode() == ISD::CARRY_FALSE) return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1); return SDValue(); } SDValue DAGCombiner::visitSUBCARRY(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); // fold (subcarry x, y, false) -> (usubo x, y) if (isNullConstant(CarryIn)) return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1); return SDValue(); } SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // fold (mul x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); bool N0IsConst = false; bool N1IsConst = false; bool N1IsOpaqueConst = false; bool N0IsOpaqueConst = false; APInt ConstValue0, ConstValue1; // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0); N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1); assert((!N0IsConst || ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) && "Splat APInt should be element width"); assert((!N1IsConst || ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) && "Splat APInt should be element width"); } else { N0IsConst = isa(N0); if (N0IsConst) { ConstValue0 = cast(N0)->getAPIntValue(); N0IsOpaqueConst = cast(N0)->isOpaque(); } N1IsConst = isa(N1); if (N1IsConst) { ConstValue1 = cast(N1)->getAPIntValue(); N1IsOpaqueConst = cast(N1)->isOpaque(); } } // fold (mul c1, c2) -> c1*c2 if (N0IsConst && N1IsConst && !N0IsOpaqueConst && !N1IsOpaqueConst) return DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, N0.getNode(), N1.getNode()); // canonicalize constant to RHS (vector doesn't have to splat) if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); // fold (mul x, 0) -> 0 if (N1IsConst && ConstValue1.isNullValue()) return N1; // fold (mul x, 1) -> x if (N1IsConst && ConstValue1.isOneValue()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (mul x, -1) -> 0-x if (N1IsConst && ConstValue1.isAllOnesValue()) { SDLoc DL(N); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); } // fold (mul x, (1 << c)) -> x << c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1) && (!VT.isVector() || Level <= AfterLegalizeVectorOps)) { SDLoc DL(N); SDValue LogBase2 = BuildLogBase2(N1, DL); AddToWorklist(LogBase2.getNode()); EVT ShiftVT = getShiftAmountTy(N0.getValueType()); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); AddToWorklist(Trunc.getNode()); return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc); } // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) { unsigned Log2Val = (-ConstValue1).logBase2(); SDLoc DL(N); // FIXME: If the input is something that is easily negated (e.g. a // single-use add), we should put the negate there. return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(Log2Val, DL, getShiftAmountTy(N0.getValueType())))); } // (mul (shl X, c1), c2) -> (mul X, c2 << c1) if (N0.getOpcode() == ISD::SHL && isConstantOrConstantVector(N1, /* NoOpaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1)); if (isConstantOrConstantVector(C3)) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3); } // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one // use. { SDValue Sh(nullptr, 0), Y(nullptr, 0); // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)). if (N0.getOpcode() == ISD::SHL && isConstantOrConstantVector(N0.getOperand(1)) && N0.getNode()->hasOneUse()) { Sh = N0; Y = N1; } else if (N1.getOpcode() == ISD::SHL && isConstantOrConstantVector(N1.getOperand(1)) && N1.getNode()->hasOneUse()) { Sh = N1; Y = N0; } if (Sh.getNode()) { SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y); return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1)); } } // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2) if (DAG.isConstantIntBuildVectorOrConstantInt(N1) && N0.getOpcode() == ISD::ADD && DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) && isMulAddWithConstProfitable(N, N0, N1)) return DAG.getNode(ISD::ADD, SDLoc(N), VT, DAG.getNode(ISD::MUL, SDLoc(N0), VT, N0.getOperand(0), N1), DAG.getNode(ISD::MUL, SDLoc(N1), VT, N0.getOperand(1), N1)); // reassociate mul if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1)) return RMUL; return SDValue(); } /// Return true if divmod libcall is available. static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, const TargetLowering &TLI) { RTLIB::Libcall LC; EVT NodeType = Node->getValueType(0); if (!NodeType.isSimple()) return false; switch (NodeType.getSimpleVT().SimpleTy) { default: return false; // No libcall for vector types. case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; } return TLI.getLibcallName(LC) != nullptr; } /// Issue divrem if both quotient and remainder are needed. SDValue DAGCombiner::useDivRem(SDNode *Node) { if (Node->use_empty()) return SDValue(); // This is a dead node, leave it alone. unsigned Opcode = Node->getOpcode(); bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM); unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; // DivMod lib calls can still work on non-legal types if using lib-calls. EVT VT = Node->getValueType(0); if (VT.isVector() || !VT.isInteger()) return SDValue(); if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT)) return SDValue(); // If DIVREM is going to get expanded into a libcall, // but there is no libcall available, then don't combine. if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) && !isDivRemLibcallAvailable(Node, isSigned, TLI)) return SDValue(); // If div is legal, it's better to do the normal expansion unsigned OtherOpcode = 0; if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) { OtherOpcode = isSigned ? ISD::SREM : ISD::UREM; if (TLI.isOperationLegalOrCustom(Opcode, VT)) return SDValue(); } else { OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV; if (TLI.isOperationLegalOrCustom(OtherOpcode, VT)) return SDValue(); } SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); SDValue combined; for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), UE = Op0.getNode()->use_end(); UI != UE;) { SDNode *User = *UI++; if (User == Node || User->use_empty()) continue; // Convert the other matching node(s), too; // otherwise, the DIVREM may get target-legalized into something // target-specific that we won't be able to recognize. unsigned UserOpc = User->getOpcode(); if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) && User->getOperand(0) == Op0 && User->getOperand(1) == Op1) { if (!combined) { if (UserOpc == OtherOpcode) { SDVTList VTs = DAG.getVTList(VT, VT); combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1); } else if (UserOpc == DivRemOpc) { combined = SDValue(User, 0); } else { assert(UserOpc == Opcode); continue; } } if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV) CombineTo(User, combined); else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM) CombineTo(User, combined.getValue(1)); } } return combined; } static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if (DAG.isUndef(N->getOpcode(), {N0, N1})) return DAG.getUNDEF(VT); // undef / X -> 0 // undef % X -> 0 if (N0.isUndef()) return DAG.getConstant(0, DL, VT); return SDValue(); } SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; SDLoc DL(N); // fold (sdiv c1, c2) -> c1/c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C); // fold (sdiv X, 1) -> X if (N1C && N1C->isOne()) return N0; // fold (sdiv X, -1) -> 0-X if (N1C && N1C->isAllOnesValue()) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); if (SDValue V = simplifyDivRem(N, DAG)) return V; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // If we know the sign bits of both operands are zero, strength reduce to a // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1); // fold (sdiv X, pow2) -> simple ops after legalize // FIXME: We check for the exact bit here because the generic lowering gives // better results in that case. The target-specific lowering should learn how // to handle exact sdivs efficiently. if (N1C && !N1C->isNullValue() && !N1C->isOpaque() && !N->getFlags().hasExact() && (N1C->getAPIntValue().isPowerOf2() || (-N1C->getAPIntValue()).isPowerOf2())) { // Target-specific implementation of sdiv x, pow2. if (SDValue Res = BuildSDIVPow2(N)) return Res; unsigned lg2 = N1C->getAPIntValue().countTrailingZeros(); // Splat the sign bit into the register SDValue SGN = DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, getShiftAmountTy(N0.getValueType()))); AddToWorklist(SGN.getNode()); // Add (N0 < 0) ? abs2 - 1 : 0; SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, SGN, DAG.getConstant(VT.getScalarSizeInBits() - lg2, DL, getShiftAmountTy(SGN.getValueType()))); SDValue ADD = DAG.getNode(ISD::ADD, DL, VT, N0, SRL); AddToWorklist(SRL.getNode()); AddToWorklist(ADD.getNode()); // Divide by pow2 SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, ADD, DAG.getConstant(lg2, DL, getShiftAmountTy(ADD.getValueType()))); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. if (N1C->getAPIntValue().isNonNegative()) return SRA; AddToWorklist(SRA.getNode()); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } // If integer divide is expensive and we satisfy the requirements, emit an // alternate sequence. Targets may check function attributes for size/speed // trade-offs. AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildSDIV(N)) return Op; // sdiv, srem -> sdivrem // If the divisor is constant, then return DIVREM only if isIntDivCheap() is // true. Otherwise, we break the simplification logic in visitREM(). if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue DivRem = useDivRem(N)) return DivRem; return SDValue(); } SDValue DAGCombiner::visitUDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; SDLoc DL(N); // fold (udiv c1, c2) -> c1/c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C) if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, N0C, N1C)) return Folded; if (SDValue V = simplifyDivRem(N, DAG)) return V; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (udiv x, (1 << c)) -> x >>u c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1)) { SDValue LogBase2 = BuildLogBase2(N1, DL); AddToWorklist(LogBase2.getNode()); EVT ShiftVT = getShiftAmountTy(N0.getValueType()); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); AddToWorklist(Trunc.getNode()); return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc); } // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (N1.getOpcode() == ISD::SHL) { SDValue N10 = N1.getOperand(0); if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N10)) { SDValue LogBase2 = BuildLogBase2(N10, DL); AddToWorklist(LogBase2.getNode()); EVT ADDVT = N1.getOperand(1).getValueType(); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT); AddToWorklist(Trunc.getNode()); SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::SRL, DL, VT, N0, Add); } } // fold (udiv x, c) -> alternate AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildUDIV(N)) return Op; // sdiv, srem -> sdivrem // If the divisor is constant, then return DIVREM only if isIntDivCheap() is // true. Otherwise, we break the simplification logic in visitREM(). if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue DivRem = useDivRem(N)) return DivRem; return SDValue(); } // handles ISD::SREM and ISD::UREM SDValue DAGCombiner::visitREM(SDNode *N) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); bool isSigned = (Opcode == ISD::SREM); SDLoc DL(N); // fold (rem c1, c2) -> c1%c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C) if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C)) return Folded; if (SDValue V = simplifyDivRem(N, DAG)) return V; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; if (isSigned) { // If we know the sign bits of both operands are zero, strength reduce to a // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UREM, DL, VT, N0, N1); } else { SDValue NegOne = DAG.getAllOnesConstant(DL, VT); if (DAG.isKnownToBeAPowerOfTwo(N1)) { // fold (urem x, pow2) -> (and x, pow2-1) SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } if (N1.getOpcode() == ISD::SHL && DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } } AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. // To avoid mangling nodes, this simplification requires that the combine() // call for the speculative DIV must not cause a DIVREM conversion. We guard // against this by skipping the simplification if isIntDivCheap(). When // div is not cheap, combine will not return a DIVREM. Regardless, // checking cheapness here makes sense since the simplification results in // fatter code. if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) { unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; SDValue Div = DAG.getNode(DivOpcode, DL, VT, N0, N1); AddToWorklist(Div.getNode()); SDValue OptimizedDiv = combine(Div.getNode()); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { assert((OptimizedDiv.getOpcode() != ISD::UDIVREM) && (OptimizedDiv.getOpcode() != ISD::SDIVREM)); SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1); SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul); AddToWorklist(Mul.getNode()); return Sub; } } // sdiv, srem -> sdivrem if (SDValue DivRem = useDivRem(N)) return DivRem.getValue(1); return SDValue(); } SDValue DAGCombiner::visitMULHS(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if (VT.isVector()) { // fold (mulhs x, 0) -> 0 if (ISD::isBuildVectorAllZeros(N1.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N0.getNode())) return N0; } // fold (mulhs x, 0) -> 0 if (isNullConstant(N1)) return N1; // fold (mulhs x, 1) -> (sra x, size(x)-1) if (isOneConstant(N1)) return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0, DAG.getConstant(N0.getValueSizeInBits() - 1, DL, getShiftAmountTy(N0.getValueType()))); // fold (mulhs x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); // If the type twice as wide is legal, transform the mulhs to a wider multiply // plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1); N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1); N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1, DAG.getConstant(SimpleSize, DL, getShiftAmountTy(N1.getValueType()))); return DAG.getNode(ISD::TRUNCATE, DL, VT, N1); } } return SDValue(); } SDValue DAGCombiner::visitMULHU(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if (VT.isVector()) { // fold (mulhu x, 0) -> 0 if (ISD::isBuildVectorAllZeros(N1.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N0.getNode())) return N0; } // fold (mulhu x, 0) -> 0 if (isNullConstant(N1)) return N1; // fold (mulhu x, 1) -> 0 if (isOneConstant(N1)) return DAG.getConstant(0, DL, N0.getValueType()); // fold (mulhu x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); // If the type twice as wide is legal, transform the mulhu to a wider multiply // plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1); N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1); N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1, DAG.getConstant(SimpleSize, DL, getShiftAmountTy(N1.getValueType()))); return DAG.getNode(ISD::TRUNCATE, DL, VT, N1); } } return SDValue(); } /// Perform optimizations common to nodes that compute two values. LoOp and HiOp /// give the opcodes for the two computations that are being performed. Return /// true if a simplification was made. SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp) { // If the high half is not needed, just compute the low half. bool HiExists = N->hasAnyUseOfValue(1); if (!HiExists && (!LegalOperations || TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) { SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); return CombineTo(N, Res, Res); } // If the low half is not needed, just compute the high half. bool LoExists = N->hasAnyUseOfValue(0); if (!LoExists && (!LegalOperations || TLI.isOperationLegal(HiOp, N->getValueType(1)))) { SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); return CombineTo(N, Res, Res); } // If both halves are used, return as it is. if (LoExists && HiExists) return SDValue(); // If the two computed results can be simplified separately, separate them. if (LoExists) { SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); AddToWorklist(Lo.getNode()); SDValue LoOpt = combine(Lo.getNode()); if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() && (!LegalOperations || TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType()))) return CombineTo(N, LoOpt, LoOpt); } if (HiExists) { SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); AddToWorklist(Hi.getNode()); SDValue HiOpt = combine(Hi.getNode()); if (HiOpt.getNode() && HiOpt != Hi && (!LegalOperations || TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType()))) return CombineTo(N, HiOpt, HiOpt); } return SDValue(); } SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) { if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS)) return Res; EVT VT = N->getValueType(0); SDLoc DL(N); // If the type is twice as wide is legal, transform the mulhu to a wider // multiply plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0)); SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1)); Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); // Compute the high part as N1. Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, DAG.getConstant(SimpleSize, DL, getShiftAmountTy(Lo.getValueType()))); Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); // Compute the low part as N0. Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); return CombineTo(N, Lo, Hi); } } return SDValue(); } SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU)) return Res; EVT VT = N->getValueType(0); SDLoc DL(N); // If the type is twice as wide is legal, transform the mulhu to a wider // multiply plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0)); SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1)); Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); // Compute the high part as N1. Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, DAG.getConstant(SimpleSize, DL, getShiftAmountTy(Lo.getValueType()))); Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); // Compute the low part as N0. Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); return CombineTo(N, Lo, Hi); } } return SDValue(); } SDValue DAGCombiner::visitSMULO(SDNode *N) { // (smulo x, 2) -> (saddo x, x) if (ConstantSDNode *C2 = dyn_cast(N->getOperand(1))) if (C2->getAPIntValue() == 2) return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(), N->getOperand(0), N->getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitUMULO(SDNode *N) { // (umulo x, 2) -> (uaddo x, x) if (ConstantSDNode *C2 = dyn_cast(N->getOperand(1))) if (C2->getAPIntValue() == 2) return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), N->getOperand(0), N->getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold operation with constant operands. ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); if (N0C && N1C) return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); return SDValue(); } /// If this is a binary operator with two operands of the same opcode, try to /// simplify it. SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); EVT VT = N0.getValueType(); assert(N0.getOpcode() == N1.getOpcode() && "Bad input!"); // Bail early if none of these transforms apply. if (N0.getNumOperands() == 0) return SDValue(); // For each of OP in AND/OR/XOR: // fold (OP (zext x), (zext y)) -> (zext (OP x, y)) // fold (OP (sext x), (sext y)) -> (sext (OP x, y)) // fold (OP (aext x), (aext y)) -> (aext (OP x, y)) // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y)) // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free) // // do not sink logical op inside of a vector extend, since it may combine // into a vsetcc. EVT Op0VT = N0.getOperand(0).getValueType(); if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::BSWAP || // Avoid infinite looping with PromoteIntBinOp. (N0.getOpcode() == ISD::ANY_EXTEND && (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) || (N0.getOpcode() == ISD::TRUNCATE && (!TLI.isZExtFree(VT, Op0VT) || !TLI.isTruncateFree(Op0VT, VT)) && TLI.isTypeLegal(Op0VT))) && !VT.isVector() && Op0VT == N1.getOperand(0).getValueType() && (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) { SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), N0.getOperand(0).getValueType(), N0.getOperand(0), N1.getOperand(0)); AddToWorklist(ORNode.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode); } // For each of OP in SHL/SRL/SRA/AND... // fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z) // fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z) // fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z) if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) && N0.getOperand(1) == N1.getOperand(1)) { SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), N0.getOperand(0).getValueType(), N0.getOperand(0), N1.getOperand(0)); AddToWorklist(ORNode.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode, N0.getOperand(1)); } // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B)) // Only perform this optimization up until type legalization, before // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and // we don't want to undo this promotion. // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper // on scalars. if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR) && Level <= AfterLegalizeTypes) { SDValue In0 = N0.getOperand(0); SDValue In1 = N1.getOperand(0); EVT In0Ty = In0.getValueType(); EVT In1Ty = In1.getValueType(); SDLoc DL(N); // If both incoming values are integers, and the original types are the // same. if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) { SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1); SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op); AddToWorklist(Op.getNode()); return BC; } } // Xor/and/or are indifferent to the swizzle operation (shuffle of one value). // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B)) // If both shuffles use the same mask, and both shuffle within a single // vector, then it is worthwhile to move the swizzle after the operation. // The type-legalizer generates this pattern when loading illegal // vector types from memory. In many cases this allows additional shuffle // optimizations. // There are other cases where moving the shuffle after the xor/and/or // is profitable even if shuffles don't perform a swizzle. // If both shuffles use the same mask, and both shuffles have the same first // or second operand, then it might still be profitable to move the shuffle // after the xor/and/or operation. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) { ShuffleVectorSDNode *SVN0 = cast(N0); ShuffleVectorSDNode *SVN1 = cast(N1); assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() && "Inputs to shuffles are not the same type"); // Check that both shuffles use the same mask. The masks are known to be of // the same length because the result vector type is the same. // Check also that shuffles have only one use to avoid introducing extra // instructions. if (SVN0->hasOneUse() && SVN1->hasOneUse() && SVN0->getMask().equals(SVN1->getMask())) { SDValue ShOp = N0->getOperand(1); // Don't try to fold this node if it requires introducing a // build vector of all zeros that might be illegal at this stage. if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) { if (!LegalTypes) ShOp = DAG.getConstant(0, SDLoc(N), VT); else ShOp = SDValue(); } // (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C) // (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C) // (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0) if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) { SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0->getOperand(0), N1->getOperand(0)); AddToWorklist(NewNode.getNode()); return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp, SVN0->getMask()); } // Don't try to fold this node if it requires introducing a // build vector of all zeros that might be illegal at this stage. ShOp = N0->getOperand(0); if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) { if (!LegalTypes) ShOp = DAG.getConstant(0, SDLoc(N), VT); else ShOp = SDValue(); } // (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B)) // (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B)) // (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B)) if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) { SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0->getOperand(1), N1->getOperand(1)); AddToWorklist(NewNode.getNode()); return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode, SVN0->getMask()); } } } return SDValue(); } /// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient. SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, const SDLoc &DL) { SDValue LL, LR, RL, RR, N0CC, N1CC; if (!isSetCCEquivalent(N0, LL, LR, N0CC) || !isSetCCEquivalent(N1, RL, RR, N1CC)) return SDValue(); assert(N0.getValueType() == N1.getValueType() && "Unexpected operand types for bitwise logic op"); assert(LL.getValueType() == LR.getValueType() && RL.getValueType() == RR.getValueType() && "Unexpected operand types for setcc"); // If we're here post-legalization or the logic op type is not i1, the logic // op type must match a setcc result type. Also, all folds require new // operations on the left and right operands, so those types must match. EVT VT = N0.getValueType(); EVT OpVT = LL.getValueType(); if (LegalOperations || VT != MVT::i1) if (VT != getSetCCResultType(OpVT)) return SDValue(); if (OpVT != RL.getValueType()) return SDValue(); ISD::CondCode CC0 = cast(N0CC)->get(); ISD::CondCode CC1 = cast(N1CC)->get(); bool IsInteger = OpVT.isInteger(); if (LR == RR && CC0 == CC1 && IsInteger) { bool IsZero = isNullConstantOrNullSplatConstant(LR); bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR); // All bits clear? bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero; // All sign bits clear? bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1; // Any bits set? bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero; // Any sign bits set? bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero; // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0) // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1) // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0) // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0) if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) { SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL); AddToWorklist(Or.getNode()); return DAG.getSetCC(DL, VT, Or, LR, CC1); } // All bits set? bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1; // All sign bits set? bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero; // Any bits clear? bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1; // Any sign bits clear? bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1; // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1) // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0) // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1) // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1) if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) { SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL); AddToWorklist(And.getNode()); return DAG.getSetCC(DL, VT, And, LR, CC1); } } // TODO: What is the 'or' equivalent of this fold? // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2) if (IsAnd && LL == RL && CC0 == CC1 && IsInteger && CC0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) || (isAllOnesConstant(LR) && isNullConstant(RR)))) { SDValue One = DAG.getConstant(1, DL, OpVT); SDValue Two = DAG.getConstant(2, DL, OpVT); SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One); AddToWorklist(Add.getNode()); return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE); } // Try more general transforms if the predicates match and the only user of // the compares is the 'and' or 'or'. if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 && N0.hasOneUse() && N1.hasOneUse()) { // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0 // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0 if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) { SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR); SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR); SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR); SDValue Zero = DAG.getConstant(0, DL, OpVT); return DAG.getSetCC(DL, VT, Or, Zero, CC1); } } // Canonicalize equivalent operands to LL == RL. if (LL == RR && LR == RL) { CC1 = ISD::getSetCCSwappedOperands(CC1); std::swap(RL, RR); } // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) if (LL == RL && LR == RR) { ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger) : ISD::getSetCCOrOperation(CC0, CC1, IsInteger); if (NewCC != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) && TLI.isOperationLegal(ISD::SETCC, OpVT)))) return DAG.getSetCC(DL, VT, LL, LR, NewCC); } return SDValue(); } /// This contains all DAGCombine rules which reduce two values combined by /// an And operation to a single value. This makes them reusable in the context /// of visitSELECT(). Rules involving constants are not included as /// visitSELECT() already handles those cases. SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); SDLoc DL(N); // fold (and x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL)) return V; if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && VT.getSizeInBits() <= 64) { if (ConstantSDNode *ADDI = dyn_cast(N0.getOperand(1))) { APInt ADDC = ADDI->getAPIntValue(); if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) { // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal // immediate for an add, but it is legal if its top c2 bits are set, // transform the ADD so the immediate doesn't need to be materialized // in a register. if (ConstantSDNode *SRLI = dyn_cast(N1.getOperand(1))) { APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), SRLI->getZExtValue()); if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { ADDC |= Mask; if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { SDLoc DL0(N0); SDValue NewAdd = DAG.getNode(ISD::ADD, DL0, VT, N0.getOperand(0), DAG.getConstant(ADDC, DL, VT)); CombineTo(N0.getNode(), NewAdd); // Return N so it doesn't get rechecked! return SDValue(N, 0); } } } } } } // Reduce bit extract of low half of an integer to the narrower type. // (and (srl i64:x, K), KMask) -> // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask) if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { if (ConstantSDNode *CAnd = dyn_cast(N1)) { if (ConstantSDNode *CShift = dyn_cast(N0.getOperand(1))) { unsigned Size = VT.getSizeInBits(); const APInt &AndMask = CAnd->getAPIntValue(); unsigned ShiftBits = CShift->getZExtValue(); // Bail out, this node will probably disappear anyway. if (ShiftBits == 0) return SDValue(); unsigned MaskBits = AndMask.countTrailingOnes(); EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); if (AndMask.isMask() && // Required bits must not span the two halves of the integer and // must fit in the half size type. (ShiftBits + MaskBits <= Size / 2) && TLI.isNarrowingProfitable(VT, HalfVT) && TLI.isTypeDesirableForOp(ISD::AND, HalfVT) && TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) && TLI.isTruncateFree(VT, HalfVT) && TLI.isZExtFree(HalfVT, VT)) { // The isNarrowingProfitable is to avoid regressions on PPC and // AArch64 which match a few 64-bit bit insert / bit extract patterns // on downstream users of this. Those patterns could probably be // extended to handle extensions mixed in. SDValue SL(N0); assert(MaskBits <= Size); // Extracting the highest bit of the low half. EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout()); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT, N0.getOperand(0)); SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT); SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT); SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK); SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask); return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And); } } } } return SDValue(); } bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, EVT LoadResultTy, EVT &ExtVT) { if (!AndC->getAPIntValue().isMask()) return false; unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes(); ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); EVT LoadedVT = LoadN->getMemoryVT(); if (ExtVT == LoadedVT && (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { // ZEXTLOAD will match without needing to change the size of the value being // loaded. return true; } // Do not change the width of a volatile load. if (LoadN->isVolatile()) return false; // Do not generate loads of non-round integer types since these can // be expensive (and would be wrong if the type is not byte sized). if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound()) return false; if (LegalOperations && !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT)) return false; if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT)) return false; return true; } bool DAGCombiner::isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType, EVT &ExtVT, unsigned ShAmt) { // Don't transform one with multiple uses, this would require adding a new // load. if (!SDValue(LoadN, 0).hasOneUse()) return false; if (LegalOperations && !TLI.isLoadExtLegal(ExtType, LoadN->getValueType(0), ExtVT)) return false; // Do not generate loads of non-round integer types since these can // be expensive (and would be wrong if the type is not byte sized). if (!ExtVT.isRound()) return false; // Don't change the width of a volatile load. if (LoadN->isVolatile()) return false; // Verify that we are actually reducing a load width here. if (LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits()) return false; // For the transform to be legal, the load must produce only two values // (the value loaded and the chain). Don't transform a pre-increment // load, for example, which produces an extra value. Otherwise the // transformation is not equivalent, and the downstream logic to replace // uses gets things wrong. if (LoadN->getNumValues() > 2) return false; // If the load that we're shrinking is an extload and we're not just // discarding the extension we can't simply shrink the load. Bail. // TODO: It would be possible to merge the extensions in some cases. if (LoadN->getExtensionType() != ISD::NON_EXTLOAD && LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt) return false; if (!TLI.shouldReduceLoadWidth(LoadN, ExtType, ExtVT)) return false; // It's not possible to generate a constant of extended or untyped type. EVT PtrType = LoadN->getOperand(1).getValueType(); if (PtrType == MVT::Untyped || PtrType.isExtended()) return false; return true; } bool DAGCombiner::SearchForAndLoads(SDNode *N, SmallPtrSetImpl &Loads, SmallPtrSetImpl &NodesWithConsts, ConstantSDNode *Mask, SDNode *&NodeToMask) { // Recursively search for the operands, looking for loads which can be // narrowed. for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) { SDValue Op = N->getOperand(i); if (Op.getValueType().isVector()) return false; // Some constants may need fixing up later if they are too large. if (auto *C = dyn_cast(Op)) { if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) && (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue()) NodesWithConsts.insert(N); continue; } if (!Op.hasOneUse()) return false; switch(Op.getOpcode()) { case ISD::LOAD: { auto *Load = cast(Op); EVT ExtVT; if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) && isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) { // Only add this load if we can make it more narrow. if (ExtVT.bitsLT(Load->getMemoryVT())) Loads.insert(Load); continue; } return false; } case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::AssertZext: { unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); EVT VT = Op.getOpcode() == ISD::AssertZext ? cast(Op.getOperand(1))->getVT() : Op.getOperand(0).getValueType(); // We can accept extending nodes if the mask is wider or an equal // width to the original type. if (ExtVT.bitsGE(VT)) continue; break; } case ISD::OR: case ISD::XOR: case ISD::AND: if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask, NodeToMask)) return false; continue; } // Allow one node which will masked along with any loads found. if (NodeToMask) return false; NodeToMask = Op.getNode(); } return true; } bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) { auto *Mask = dyn_cast(N->getOperand(1)); if (!Mask) return false; if (!Mask->getAPIntValue().isMask()) return false; // No need to do anything if the and directly uses a load. if (isa(N->getOperand(0))) return false; SmallPtrSet Loads; SmallPtrSet NodesWithConsts; SDNode *FixupNode = nullptr; if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) { if (Loads.size() == 0) return false; SDValue MaskOp = N->getOperand(1); // If it exists, fixup the single node we allow in the tree that needs // masking. if (FixupNode) { SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode), FixupNode->getValueType(0), SDValue(FixupNode, 0), MaskOp); DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And); DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp); } // Narrow any constants that need it. for (auto *LogicN : NodesWithConsts) { auto *C = cast(LogicN->getOperand(1)); SDValue And = DAG.getNode(ISD::AND, SDLoc(C), C->getValueType(0), SDValue(C, 0), MaskOp); DAG.UpdateNodeOperands(LogicN, LogicN->getOperand(0), And); } // Create narrow loads. for (auto *Load : Loads) { SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0), SDValue(Load, 0), MaskOp); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And); DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp); SDValue NewLoad = ReduceLoadWidth(And.getNode()); assert(NewLoad && "Shouldn't be masking the load if it can't be narrowed"); CombineTo(Load, NewLoad, NewLoad.getValue(1)); } DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode()); return true; } return false; } SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N1.getValueType(); // x & x --> x if (N0 == N1) return N0; // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (and x, 0) -> 0, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) // do not return N0, because undef node may exist in N0 return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()), SDLoc(N), N0.getValueType()); if (ISD::isBuildVectorAllZeros(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()), SDLoc(N), N1.getValueType()); // fold (and x, -1) -> x, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; if (ISD::isBuildVectorAllOnes(N1.getNode())) return N0; } // fold (and c1, c2) -> c1&c2 ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); // fold (and x, -1) -> x if (isAllOnesConstant(N1)) return N0; // if (and x, c) is known to be zero, return 0 unsigned BitWidth = VT.getScalarSizeInBits(); if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(BitWidth))) return DAG.getConstant(0, SDLoc(N), VT); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // reassociate and if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1)) return RAND; + + // Try to convert a constant mask AND into a shuffle clear mask. + if (VT.isVector()) + if (SDValue Shuffle = XformToShuffleWithZero(N)) + return Shuffle; + // fold (and (or x, C), D) -> D if (C & D) == D auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); }; if (N0.getOpcode() == ISD::OR && matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset)) return N1; // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits. if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { SDValue N0Op0 = N0.getOperand(0); APInt Mask = ~N1C->getAPIntValue(); Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits()); if (DAG.MaskedValueIsZero(N0Op0, Mask)) { SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N0.getValueType(), N0Op0); // Replace uses of the AND with uses of the Zero extend node. CombineTo(N, Zext); // We actually want to replace all uses of the any_extend with the // zero_extend, to avoid duplicating things. This will later cause this // AND to be folded. CombineTo(N0.getNode(), Zext); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) -> // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must // already be zero by virtue of the width of the base type of the load. // // the 'X' node here can either be nothing or an extract_vector_elt to catch // more cases. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() && N0.getOperand(0).getOpcode() == ISD::LOAD && N0.getOperand(0).getResNo() == 0) || (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) { LoadSDNode *Load = cast( (N0.getOpcode() == ISD::LOAD) ? N0 : N0.getOperand(0) ); // Get the constant (if applicable) the zero'th operand is being ANDed with. // This can be a pure constant or a vector splat, in which case we treat the // vector as a scalar and use the splat value. APInt Constant = APInt::getNullValue(1); if (const ConstantSDNode *C = dyn_cast(N1)) { Constant = C->getAPIntValue(); } else if (BuildVectorSDNode *Vector = dyn_cast(N1)) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs); if (IsSplat) { // Undef bits can contribute to a possible optimisation if set, so // set them. SplatValue |= SplatUndef; // The splat value may be something like "0x00FFFFFF", which means 0 for // the first vector value and FF for the rest, repeating. We need a mask // that will apply equally to all members of the vector, so AND all the // lanes of the constant together. EVT VT = Vector->getValueType(0); unsigned BitWidth = VT.getScalarSizeInBits(); // If the splat value has been compressed to a bitlength lower // than the size of the vector lane, we need to re-expand it to // the lane size. if (BitWidth > SplatBitSize) for (SplatValue = SplatValue.zextOrTrunc(BitWidth); SplatBitSize < BitWidth; SplatBitSize = SplatBitSize * 2) SplatValue |= SplatValue.shl(SplatBitSize); // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value. if (SplatBitSize % BitWidth == 0) { Constant = APInt::getAllOnesValue(BitWidth); for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i) Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth); } } } // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is // actually legal and isn't going to get expanded, else this is a false // optimisation. bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, Load->getValueType(0), Load->getMemoryVT()); // Resize the constant to the same size as the original memory access before // extension. If it is still the AllOnesValue then this AND is completely // unneeded. Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits()); bool B; switch (Load->getExtensionType()) { default: B = false; break; case ISD::EXTLOAD: B = CanZextLoadProfitably; break; case ISD::ZEXTLOAD: case ISD::NON_EXTLOAD: B = true; break; } if (B && Constant.isAllOnesValue()) { // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to // preserve semantics once we get rid of the AND. SDValue NewLoad(Load, 0); // Fold the AND away. NewLoad may get replaced immediately. CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); if (Load->getExtensionType() == ISD::EXTLOAD) { NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, Load->getValueType(0), SDLoc(Load), Load->getChain(), Load->getBasePtr(), Load->getOffset(), Load->getMemoryVT(), Load->getMemOperand()); // Replace uses of the EXTLOAD with the new ZEXTLOAD. if (Load->getNumValues() == 3) { // PRE/POST_INC loads have 3 values. SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1), NewLoad.getValue(2) }; CombineTo(Load, To, 3, true); } else { CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); } } return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (and (load x), 255) -> (zextload x, i8) // fold (and (extload x, i16), 255) -> (zextload x, i8) // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8) if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD || (N0.getOpcode() == ISD::ANY_EXTEND && N0.getOperand(0).getOpcode() == ISD::LOAD))) { if (SDValue Res = ReduceLoadWidth(N)) { LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND ? cast(N0.getOperand(0)) : cast(N0); AddToWorklist(N); CombineTo(LN0, Res, Res.getValue(1)); return SDValue(N, 0); } } if (Level >= AfterLegalizeTypes) { // Attempt to propagate the AND back up to the leaves which, if they're // loads, can be combined to narrow loads and the AND node can be removed. // Perform after legalization so that extend nodes will already be // combined into the loads. if (BackwardsPropagateMask(N, DAG)) { return SDValue(N, 0); } } if (SDValue Combined = visitANDLike(N0, N1, N)) return Combined; // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) if (N0.getOpcode() == N1.getOpcode()) if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) return Tmp; // Masking the negated extension of a boolean is just the zero-extended // boolean: // and (sub 0, zext(bool X)), 1 --> zext(bool X) // and (sub 0, sext(bool X)), 1 --> zext(bool X) // // Note: the SimplifyDemandedBits fold below can make an information-losing // transform, and then we have no way to find this better fold. if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) { SDValue SubRHS = N0.getOperand(1); if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) return SubRHS; if (SubRHS.getOpcode() == ISD::SIGN_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0)); } } // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) // fold (and (sra)) -> (and (srl)) when possible. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (zext_inreg (extload x)) -> (zextload x) if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); // If we zero all the possible extended bits, then we can turn this into // a zextload if we are running before legalize or the operation is legal. unsigned BitWidth = N1.getScalarValueSizeInBits(); if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); // If we zero all the possible extended bits, then we can turn this into // a zextload if we are running before legalize or the operation is legal. unsigned BitWidth = N1.getScalarValueSizeInBits(); if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const) if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false)) return BSwap; } return SDValue(); } /// Match (a >> 8) | (a << 8) as (bswap a) >> 16. SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits) { if (!LegalOperations) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16) return SDValue(); if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff) bool LookPassAnd0 = false; bool LookPassAnd1 = false; if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL) std::swap(N0, N1); if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() == ISD::AND) { if (!N0.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); if (!N01C || N01C->getZExtValue() != 0xFF00) return SDValue(); N0 = N0.getOperand(0); LookPassAnd0 = true; } if (N1.getOpcode() == ISD::AND) { if (!N1.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C || N11C->getZExtValue() != 0xFF) return SDValue(); N1 = N1.getOperand(0); LookPassAnd1 = true; } if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N01C || !N11C) return SDValue(); if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8) return SDValue(); // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8) SDValue N00 = N0->getOperand(0); if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) { if (!N00.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N001C = dyn_cast(N00.getOperand(1)); if (!N001C || N001C->getZExtValue() != 0xFF) return SDValue(); N00 = N00.getOperand(0); LookPassAnd0 = true; } SDValue N10 = N1->getOperand(0); if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) { if (!N10.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N101C = dyn_cast(N10.getOperand(1)); if (!N101C || N101C->getZExtValue() != 0xFF00) return SDValue(); N10 = N10.getOperand(0); LookPassAnd1 = true; } if (N00 != N10) return SDValue(); // Make sure everything beyond the low halfword gets set to zero since the SRL // 16 will clear the top bits. unsigned OpSizeInBits = VT.getSizeInBits(); if (DemandHighBits && OpSizeInBits > 16) { // If the left-shift isn't masked out then the only way this is a bswap is // if all bits beyond the low 8 are 0. In that case the entire pattern // reduces to a left shift anyway: leave it for other parts of the combiner. if (!LookPassAnd0) return SDValue(); // However, if the right shift isn't masked out then it might be because // it's not needed. See if we can spot that too. if (!LookPassAnd1 && !DAG.MaskedValueIsZero( N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16))) return SDValue(); } SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00); if (OpSizeInBits > 16) { SDLoc DL(N); Res = DAG.getNode(ISD::SRL, DL, VT, Res, DAG.getConstant(OpSizeInBits - 16, DL, getShiftAmountTy(VT))); } return Res; } /// Return true if the specified node is an element that makes up a 32-bit /// packed halfword byteswap. /// ((x & 0x000000ff) << 8) | /// ((x & 0x0000ff00) >> 8) | /// ((x & 0x00ff0000) << 8) | /// ((x & 0xff000000) >> 8) static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { if (!N.getNode()->hasOneUse()) return false; unsigned Opc = N.getOpcode(); if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL) return false; SDValue N0 = N.getOperand(0); unsigned Opc0 = N0.getOpcode(); if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL) return false; ConstantSDNode *N1C = nullptr; // SHL or SRL: look upstream for AND mask operand if (Opc == ISD::AND) N1C = dyn_cast(N.getOperand(1)); else if (Opc0 == ISD::AND) N1C = dyn_cast(N0.getOperand(1)); if (!N1C) return false; unsigned MaskByteOffset; switch (N1C->getZExtValue()) { default: return false; case 0xFF: MaskByteOffset = 0; break; case 0xFF00: MaskByteOffset = 1; break; case 0xFF0000: MaskByteOffset = 2; break; case 0xFF000000: MaskByteOffset = 3; break; } // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00). if (Opc == ISD::AND) { if (MaskByteOffset == 0 || MaskByteOffset == 2) { // (x >> 8) & 0xff // (x >> 8) & 0xff0000 if (Opc0 != ISD::SRL) return false; ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } else { // (x << 8) & 0xff00 // (x << 8) & 0xff000000 if (Opc0 != ISD::SHL) return false; ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } } else if (Opc == ISD::SHL) { // (x & 0xff) << 8 // (x & 0xff0000) << 8 if (MaskByteOffset != 0 && MaskByteOffset != 2) return false; ConstantSDNode *C = dyn_cast(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } else { // Opc == ISD::SRL // (x & 0xff00) >> 8 // (x & 0xff000000) >> 8 if (MaskByteOffset != 1 && MaskByteOffset != 3) return false; ConstantSDNode *C = dyn_cast(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } if (Parts[MaskByteOffset]) return false; Parts[MaskByteOffset] = N0.getOperand(0).getNode(); return true; } /// Match a 32-bit packed halfword bswap. That is /// ((x & 0x000000ff) << 8) | /// ((x & 0x0000ff00) >> 8) | /// ((x & 0x00ff0000) << 8) | /// ((x & 0xff000000) >> 8) /// => (rotl (bswap x), 16) SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (!LegalOperations) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i32) return SDValue(); if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); // Look for either // (or (or (and), (and)), (or (and), (and))) // (or (or (or (and), (and)), (and)), (and)) if (N0.getOpcode() != ISD::OR) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDNode *Parts[4] = {}; if (N1.getOpcode() == ISD::OR && N00.getNumOperands() == 2 && N01.getNumOperands() == 2) { // (or (or (and), (and)), (or (and), (and))) if (!isBSwapHWordElement(N00, Parts)) return SDValue(); if (!isBSwapHWordElement(N01, Parts)) return SDValue(); SDValue N10 = N1.getOperand(0); if (!isBSwapHWordElement(N10, Parts)) return SDValue(); SDValue N11 = N1.getOperand(1); if (!isBSwapHWordElement(N11, Parts)) return SDValue(); } else { // (or (or (or (and), (and)), (and)), (and)) if (!isBSwapHWordElement(N1, Parts)) return SDValue(); if (!isBSwapHWordElement(N01, Parts)) return SDValue(); if (N00.getOpcode() != ISD::OR) return SDValue(); SDValue N000 = N00.getOperand(0); if (!isBSwapHWordElement(N000, Parts)) return SDValue(); SDValue N001 = N00.getOperand(1); if (!isBSwapHWordElement(N001, Parts)) return SDValue(); } // Make sure the parts are all coming from the same node. if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3]) return SDValue(); SDLoc DL(N); SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, SDValue(Parts[0], 0)); // Result of the bswap should be rotated by 16. If it's not legal, then // do (x << 16) | (x >> 16). SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT)); if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT)) return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt); if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT)) return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt); return DAG.getNode(ISD::OR, DL, VT, DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt), DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt)); } /// This contains all DAGCombine rules which reduce two values combined by /// an Or operation to a single value \see visitANDLike(). SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); SDLoc DL(N); // fold (or x, undef) -> -1 if (!LegalOperations && (N0.isUndef() || N1.isUndef())) return DAG.getAllOnesConstant(DL, VT); if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL)) return V; // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && // Don't increase # computations. (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { // We can only do this xform if we know that bits from X that are set in C2 // but not in C1 are already zero. Likewise for Y. if (const ConstantSDNode *N0O1C = getAsNonOpaqueConstant(N0.getOperand(1))) { if (const ConstantSDNode *N1O1C = getAsNonOpaqueConstant(N1.getOperand(1))) { // We can only do this xform if we know that bits from X that are set in // C2 but not in C1 are already zero. Likewise for Y. const APInt &LHSMask = N0O1C->getAPIntValue(); const APInt &RHSMask = N1O1C->getAPIntValue(); if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) && DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1.getOperand(0)); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(LHSMask | RHSMask, DL, VT)); } } } } // (or (and X, M), (and X, N)) -> (and X, (or M, N)) if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && N0.getOperand(0) == N1.getOperand(0) && // Don't increase # computations. (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(1), N1.getOperand(1)); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); } return SDValue(); } SDValue DAGCombiner::visitOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N1.getValueType(); // x | x --> x if (N0 == N1) return N0; // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (or x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; // fold (or x, -1) -> -1, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) // do not return N0, because undef node may exist in N0 return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType()); if (ISD::isBuildVectorAllOnes(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask) // Do this only if the resulting shuffle is legal. if (isa(N0) && isa(N1) && // Avoid folding a node with illegal type. TLI.isTypeLegal(VT)) { bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode()); bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()); bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode()); // Ensure both shuffles have a zero input. if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) { assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!"); assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!"); const ShuffleVectorSDNode *SV0 = cast(N0); const ShuffleVectorSDNode *SV1 = cast(N1); bool CanFold = true; int NumElts = VT.getVectorNumElements(); SmallVector Mask(NumElts); for (int i = 0; i != NumElts; ++i) { int M0 = SV0->getMaskElt(i); int M1 = SV1->getMaskElt(i); // Determine if either index is pointing to a zero vector. bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts)); bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts)); // If one element is zero and the otherside is undef, keep undef. // This also handles the case that both are undef. if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) { Mask[i] = -1; continue; } // Make sure only one of the elements is zero. if (M0Zero == M1Zero) { CanFold = false; break; } assert((M0 >= 0 || M1 >= 0) && "Undef index!"); // We have a zero and non-zero element. If the non-zero came from // SV0 make the index a LHS index. If it came from SV1, make it // a RHS index. We need to mod by NumElts because we don't care // which operand it came from in the original shuffles. Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts; } if (CanFold) { SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0); SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0); bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT); if (!LegalMask) { std::swap(NewLHS, NewRHS); ShuffleVectorSDNode::commuteMask(Mask); LegalMask = TLI.isShuffleMaskLegal(Mask, VT); } if (LegalMask) return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask); } } } } // fold (or c1, c2) -> c1|c2 ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); // fold (or x, 0) -> x if (isNullConstant(N1)) return N0; // fold (or x, -1) -> -1 if (isAllOnesConstant(N1)) return N1; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (or x, c) -> c iff (x & ~c) == 0 if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; if (SDValue Combined = visitORLike(N0, N1, N)) return Combined; // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) if (SDValue BSwap = MatchBSwapHWord(N, N0, N1)) return BSwap; if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1)) return BSwap; // reassociate or if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1)) return ROR; // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) // iff (c1 & c2) != 0. auto MatchIntersect = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { return LHS->getAPIntValue().intersects(RHS->getAPIntValue()); }; if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) { if (SDValue COR = DAG.FoldConstantArithmetic( ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) { SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(IOR.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR); } } // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) if (N0.getOpcode() == N1.getOpcode()) if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) return Tmp; // See if this is some rotate idiom. if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) return SDValue(Rot, 0); if (SDValue Load = MatchLoadCombine(N)) return Load; // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); } /// Match "(X shl/srl V1) & V2" where V2 may not be present. bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) { if (Op.getOpcode() == ISD::AND) { if (DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) { Mask = Op.getOperand(1); Op = Op.getOperand(0); } else { return false; } } if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) { Shift = Op; return true; } return false; } // Return true if we can prove that, whenever Neg and Pos are both in the // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that // for two opposing shifts shift1 and shift2 and a value X with OpBits bits: // // (or (shift1 X, Neg), (shift2 X, Pos)) // // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate // in direction shift1 by Neg. The range [0, EltSize) means that we only need // to consider shift amounts with defined behavior. static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) { // If EltSize is a power of 2 then: // // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1) // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize). // // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check // for the stronger condition: // // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A] // // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1) // we can just replace Neg with Neg' for the rest of the function. // // In other cases we check for the even stronger condition: // // Neg == EltSize - Pos [B] // // for all Neg and Pos. Note that the (or ...) then invokes undefined // behavior if Pos == 0 (and consequently Neg == EltSize). // // We could actually use [A] whenever EltSize is a power of 2, but the // only extra cases that it would match are those uninteresting ones // where Neg and Pos are never in range at the same time. E.g. for // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) // as well as (sub 32, Pos), but: // // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos)) // // always invokes undefined behavior for 32-bit X. // // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise. unsigned MaskLoBits = 0; if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) { if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) { if (NegC->getAPIntValue() == EltSize - 1) { Neg = Neg.getOperand(0); MaskLoBits = Log2_64(EltSize); } } } // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. if (Neg.getOpcode() != ISD::SUB) return false; ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0)); if (!NegC) return false; SDValue NegOp1 = Neg.getOperand(1); // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with // Pos'. The truncation is redundant for the purpose of the equality. if (MaskLoBits && Pos.getOpcode() == ISD::AND) if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) if (PosC->getAPIntValue() == EltSize - 1) Pos = Pos.getOperand(0); // The condition we need is now: // // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask // // If NegOp1 == Pos then we need: // // EltSize & Mask == NegC & Mask // // (because "x & Mask" is a truncation and distributes through subtraction). APInt Width; if (Pos == NegOp1) Width = NegC->getAPIntValue(); // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC. // Then the condition we want to prove becomes: // // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask // // which, again because "x & Mask" is a truncation, becomes: // // NegC & Mask == (EltSize - PosC) & Mask // EltSize & Mask == (NegC + PosC) & Mask else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) { if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) Width = PosC->getAPIntValue() + NegC->getAPIntValue(); else return false; } else return false; // Now we just need to check that EltSize & Mask == Width & Mask. if (MaskLoBits) // EltSize & Mask is 0 since Mask is EltSize - 1. return Width.getLoBits(MaskLoBits) == 0; return Width == EltSize; } // A subroutine of MatchRotate used once we have found an OR of two opposite // shifts of Shifted. If Neg == - Pos then the OR reduces // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the // former being preferred if supported. InnerPos and InnerNeg are Pos and // Neg with outer conversions stripped away. SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL) { // fold (or (shl x, (*ext y)), // (srl x, (*ext (sub 32, y)))) -> // (rotl x, y) or (rotr x, (sub 32, y)) // // fold (or (shl x, (*ext (sub 32, y))), // (srl x, (*ext y))) -> // (rotr x, y) or (rotl x, (sub 32, y)) EVT VT = Shifted.getValueType(); if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) { bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, HasPos ? Pos : Neg).getNode(); } return nullptr; } // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // Must be a legal type. Expanded 'n promoted things won't work with rotates. EVT VT = LHS.getValueType(); if (!TLI.isTypeLegal(VT)) return nullptr; // The target must have at least one rotate flavor. bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT); bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT); if (!HasROTL && !HasROTR) return nullptr; // Check for truncated rotate. if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE && LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) { assert(LHS.getValueType() == RHS.getValueType()); if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) { return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), SDValue(Rot, 0)).getNode(); } } // Match "(X shl/srl V1) & V2" where V2 may not be present. SDValue LHSShift; // The shift. SDValue LHSMask; // AND value if any. if (!MatchRotateHalf(LHS, LHSShift, LHSMask)) return nullptr; // Not part of a rotate. SDValue RHSShift; // The shift. SDValue RHSMask; // AND value if any. if (!MatchRotateHalf(RHS, RHSShift, RHSMask)) return nullptr; // Not part of a rotate. if (LHSShift.getOperand(0) != RHSShift.getOperand(0)) return nullptr; // Not shifting the same value. if (LHSShift.getOpcode() == RHSShift.getOpcode()) return nullptr; // Shifts must disagree. // Canonicalize shl to left side in a shl/srl pair. if (RHSShift.getOpcode() == ISD::SHL) { std::swap(LHS, RHS); std::swap(LHSShift, RHSShift); std::swap(LHSMask, RHSMask); } unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue LHSShiftArg = LHSShift.getOperand(0); SDValue LHSShiftAmt = LHSShift.getOperand(1); SDValue RHSShiftArg = RHSShift.getOperand(0); SDValue RHSShiftAmt = RHSShift.getOperand(1); // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; }; if (matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); // If there is an AND of either shifted operand, apply it to the result. if (LHSMask.getNode() || RHSMask.getNode()) { SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); SDValue Mask = AllOnes; if (LHSMask.getNode()) { SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits)); } if (RHSMask.getNode()) { SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); } Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); } return Rot.getNode(); } // If there is a mask here, and we have a variable shift, we can't be sure // that we're masking out the right stuff. if (LHSMask.getNode() || RHSMask.getNode()) return nullptr; // If the shift amount is sign/zext/any-extended just peel it off. SDValue LExtOp0 = LHSShiftAmt; SDValue RExtOp0 = RHSShiftAmt; if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || LHSShiftAmt.getOpcode() == ISD::TRUNCATE) && (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) { LExtOp0 = LHSShiftAmt.getOperand(0); RExtOp0 = RHSShiftAmt.getOperand(0); } SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL); if (TryL) return TryL; SDNode *TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL); if (TryR) return TryR; return nullptr; } namespace { /// Represents known origin of an individual byte in load combine pattern. The /// value of the byte is either constant zero or comes from memory. struct ByteProvider { // For constant zero providers Load is set to nullptr. For memory providers // Load represents the node which loads the byte from memory. // ByteOffset is the offset of the byte in the value produced by the load. LoadSDNode *Load = nullptr; unsigned ByteOffset = 0; ByteProvider() = default; static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { return ByteProvider(Load, ByteOffset); } static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } bool isConstantZero() const { return !Load; } bool isMemory() const { return Load; } bool operator==(const ByteProvider &Other) const { return Other.Load == Load && Other.ByteOffset == ByteOffset; } private: ByteProvider(LoadSDNode *Load, unsigned ByteOffset) : Load(Load), ByteOffset(ByteOffset) {} }; } // end anonymous namespace /// Recursively traverses the expression calculating the origin of the requested /// byte of the given value. Returns None if the provider can't be calculated. /// /// For all the values except the root of the expression verifies that the value /// has exactly one use and if it's not true return None. This way if the origin /// of the byte is returned it's guaranteed that the values which contribute to /// the byte are not used outside of this expression. /// /// Because the parts of the expression are not allowed to have more than one /// use this function iterates over trees, not DAGs. So it never visits the same /// node more than once. static const Optional calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, bool Root = false) { // Typical i64 by i8 pattern requires recursion up to 8 calls depth if (Depth == 10) return None; if (!Root && !Op.hasOneUse()) return None; assert(Op.getValueType().isScalarInteger() && "can't handle other types"); unsigned BitWidth = Op.getValueSizeInBits(); if (BitWidth % 8 != 0) return None; unsigned ByteWidth = BitWidth / 8; assert(Index < ByteWidth && "invalid index requested"); (void) ByteWidth; switch (Op.getOpcode()) { case ISD::OR: { auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1); if (!LHS) return None; auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1); if (!RHS) return None; if (LHS->isConstantZero()) return RHS; if (RHS->isConstantZero()) return LHS; return None; } case ISD::SHL: { auto ShiftOp = dyn_cast(Op->getOperand(1)); if (!ShiftOp) return None; uint64_t BitShift = ShiftOp->getZExtValue(); if (BitShift % 8 != 0) return None; uint64_t ByteShift = BitShift / 8; return Index < ByteShift ? ByteProvider::getConstantZero() : calculateByteProvider(Op->getOperand(0), Index - ByteShift, Depth + 1); } case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { SDValue NarrowOp = Op->getOperand(0); unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); if (NarrowBitWidth % 8 != 0) return None; uint64_t NarrowByteWidth = NarrowBitWidth / 8; if (Index >= NarrowByteWidth) return Op.getOpcode() == ISD::ZERO_EXTEND ? Optional(ByteProvider::getConstantZero()) : None; return calculateByteProvider(NarrowOp, Index, Depth + 1); } case ISD::BSWAP: return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1, Depth + 1); case ISD::LOAD: { auto L = cast(Op.getNode()); if (L->isVolatile() || L->isIndexed()) return None; unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); if (NarrowBitWidth % 8 != 0) return None; uint64_t NarrowByteWidth = NarrowBitWidth / 8; if (Index >= NarrowByteWidth) return L->getExtensionType() == ISD::ZEXTLOAD ? Optional(ByteProvider::getConstantZero()) : None; return ByteProvider::getMemory(L, Index); } } return None; } /// Match a pattern where a wide type scalar value is loaded by several narrow /// loads and combined by shifts and ors. Fold it into a single load or a load /// and a BSWAP if the targets supports it. /// /// Assuming little endian target: /// i8 *a = ... /// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24) /// => /// i32 val = *((i32)a) /// /// i8 *a = ... /// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3] /// => /// i32 val = BSWAP(*((i32)a)) /// /// TODO: This rule matches complex patterns with OR node roots and doesn't /// interact well with the worklist mechanism. When a part of the pattern is /// updated (e.g. one of the loads) its direct users are put into the worklist, /// but the root node of the pattern which triggers the load combine is not /// necessarily a direct user of the changed node. For example, once the address /// of t28 load is reassociated load combine won't be triggered: /// t25: i32 = add t4, Constant:i32<2> /// t26: i64 = sign_extend t25 /// t27: i64 = add t2, t26 /// t28: i8,ch = load t0, t27, undef:i64 /// t29: i32 = zero_extend t28 /// t32: i32 = shl t29, Constant:i8<8> /// t33: i32 = or t23, t32 /// As a possible fix visitLoad can check if the load can be a part of a load /// combine pattern and add corresponding OR roots to the worklist. SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { assert(N->getOpcode() == ISD::OR && "Can only match load combining against OR nodes"); // Handles simple types only EVT VT = N->getValueType(0); if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); unsigned ByteWidth = VT.getSizeInBits() / 8; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Before legalize we can introduce too wide illegal loads which will be later // split into legal sized loads. This enables us to combine i64 load by i8 // patterns to a couple of i32 loads on 32 bit targets. if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); std::function LittleEndianByteAt = []( unsigned BW, unsigned i) { return i; }; std::function BigEndianByteAt = []( unsigned BW, unsigned i) { return BW - i - 1; }; bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { assert(P.isMemory() && "Must be a memory byte provider"); unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits(); assert(LoadBitWidth % 8 == 0 && "can only analyze providers for individual bytes not bit"); unsigned LoadByteWidth = LoadBitWidth / 8; return IsBigEndianTarget ? BigEndianByteAt(LoadByteWidth, P.ByteOffset) : LittleEndianByteAt(LoadByteWidth, P.ByteOffset); }; Optional Base; SDValue Chain; SmallSet Loads; Optional FirstByteProvider; int64_t FirstOffset = INT64_MAX; // Check if all the bytes of the OR we are looking at are loaded from the same // base address. Collect bytes offsets from Base address in ByteOffsets. SmallVector ByteOffsets(ByteWidth); for (unsigned i = 0; i < ByteWidth; i++) { auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); if (!P || !P->isMemory()) // All the bytes must be loaded from memory return SDValue(); LoadSDNode *L = P->Load; assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() && "Must be enforced by calculateByteProvider"); assert(L->getOffset().isUndef() && "Unindexed load must have undef offset"); // All loads must share the same chain SDValue LChain = L->getChain(); if (!Chain) Chain = LChain; else if (Chain != LChain) return SDValue(); // Loads must share the same base address BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); int64_t ByteOffsetFromBase = 0; if (!Base) Base = Ptr; else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase)) return SDValue(); // Calculate the offset of the current byte from the base address ByteOffsetFromBase += MemoryByteOffset(*P); ByteOffsets[i] = ByteOffsetFromBase; // Remember the first byte load if (ByteOffsetFromBase < FirstOffset) { FirstByteProvider = P; FirstOffset = ByteOffsetFromBase; } Loads.insert(L); } assert(!Loads.empty() && "All the bytes of the value must be loaded from " "memory, so there must be at least one load which produces the value"); assert(Base && "Base address of the accessed memory location must be set"); assert(FirstOffset != INT64_MAX && "First byte offset must be set"); // Check if the bytes of the OR we are looking at match with either big or // little endian value load bool BigEndian = true, LittleEndian = true; for (unsigned i = 0; i < ByteWidth; i++) { int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i); BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i); if (!BigEndian && !LittleEndian) return SDValue(); } assert((BigEndian != LittleEndian) && "should be either or"); assert(FirstByteProvider && "must be set"); // Ensure that the first byte is loaded from zero offset of the first load. // So the combined value can be loaded from the first load address. if (MemoryByteOffset(*FirstByteProvider) != 0) return SDValue(); LoadSDNode *FirstLoad = FirstByteProvider->Load; // The node we are looking at matches with the pattern, check if we can // replace it with a single load and bswap if needed. // If the load needs byte swap check if the target supports it bool NeedsBswap = IsBigEndianTarget != BigEndian; // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single // load and byte shuffling instead of several loads and byte shuffling. if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) return SDValue(); // Check that a load of the wide type is both allowed and fast on the target bool Fast = false; bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, FirstLoad->getAddressSpace(), FirstLoad->getAlignment(), &Fast); if (!Allowed || !Fast) return SDValue(); SDValue NewLoad = DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); // Transfer chain users from old loads to the new load. for (LoadSDNode *L : Loads) DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; } SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (xor x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; } // fold (xor undef, undef) -> 0. This is a common idiom (misuse). if (N0.isUndef() && N1.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); // fold (xor x, undef) -> undef if (N0.isUndef()) return N0; if (N1.isUndef()) return N1; // fold (xor c1, c2) -> c1^c2 ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0); // fold (xor x, 0) -> x if (isNullConstant(N1)) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // reassociate xor if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1)) return RXOR; // fold !(x cc y) -> (x !cc y) SDValue LHS, RHS, CC; if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) { bool isInt = LHS.getValueType().isInteger(); ISD::CondCode NotCC = ISD::getSetCCInverse(cast(CC)->get(), isInt); if (!LegalOperations || TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) { switch (N0.getOpcode()) { default: llvm_unreachable("Unhandled SetCC Equivalent!"); case ISD::SETCC: return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC); case ISD::SELECT_CC: return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2), N0.getOperand(3), NotCC); } } } // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y))) if (isOneConstant(N1) && N0.getOpcode() == ISD::ZERO_EXTEND && N0.getNode()->hasOneUse() && isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){ SDValue V = N0.getOperand(0); SDLoc DL(N0); V = DAG.getNode(ISD::XOR, DL, V.getValueType(), V, DAG.getConstant(1, DL, V.getValueType())); AddToWorklist(V.getNode()); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V); } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc if (isOneConstant(N1) && VT == MVT::i1 && (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) { unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants if (isAllOnesConstant(N1) && (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); if (isa(RHS) || isa(LHS)) { unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } } // fold (xor (and x, y), y) -> (and (not x), y) if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0->getOperand(1) == N1) { SDValue X = N0->getOperand(0); SDValue NotX = DAG.getNOT(SDLoc(X), X, VT); AddToWorklist(NotX.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1); } // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) unsigned OpSizeInBits = VT.getScalarSizeInBits(); if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) && TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) if (C->getAPIntValue() == (OpSizeInBits - 1)) return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0)); } // fold (xor x, x) -> 0 if (N0 == N1) return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes); // fold (xor (shl 1, x), -1) -> (rotl ~1, x) // Here is a concrete example of this equivalence: // i16 x == 14 // i16 shl == 1 << 14 == 16384 == 0b0100000000000000 // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111 // // => // // i16 ~1 == 0b1111111111111110 // i16 rol(~1, 14) == 0b1011111111111111 // // Some additional tips to help conceptualize this transform: // - Try to see the operation as placing a single zero in a value of all ones. // - There exists no value for x which would allow the result to contain zero. // - Values of x larger than the bitwidth are undefined and do not require a // consistent result. // - Pushing the zero left requires shifting one bits in from the right. // A rotate left of ~1 is a nice way of achieving the desired result. if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0.getOpcode() == ISD::SHL && isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) { SDLoc DL(N); return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT), N0.getOperand(1)); } // Simplify: xor (op x...), (op y...) -> (op (xor x, y)) if (N0.getOpcode() == N1.getOpcode()) if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) return Tmp; // Simplify the expression using non-local knowledge. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); } /// Handle transforms common to the three shifts, when the shift amount is a /// constant. SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { SDNode *LHS = N->getOperand(0).getNode(); if (!LHS->hasOneUse()) return SDValue(); // We want to pull some binops through shifts, so that we have (and (shift)) // instead of (shift (and)), likewise for add, or, xor, etc. This sort of // thing happens with address calculations, so it's important to canonicalize // it. bool HighBitSet = false; // Can we transform this if the high bit is set? switch (LHS->getOpcode()) { default: return SDValue(); case ISD::OR: case ISD::XOR: HighBitSet = false; // We can only transform sra if the high bit is clear. break; case ISD::AND: HighBitSet = true; // We can only transform sra if the high bit is set. break; case ISD::ADD: if (N->getOpcode() != ISD::SHL) return SDValue(); // only shl(add) not sr[al](add). HighBitSet = false; // We can only transform sra if the high bit is clear. break; } // We require the RHS of the binop to be a constant and not opaque as well. ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1)); if (!BinOpCst) return SDValue(); // FIXME: disable this unless the input to the binop is a shift by a constant // or is copy/select.Enable this in other cases when figure out it's exactly profitable. SDNode *BinOpLHSVal = LHS->getOperand(0).getNode(); bool isShift = BinOpLHSVal->getOpcode() == ISD::SHL || BinOpLHSVal->getOpcode() == ISD::SRA || BinOpLHSVal->getOpcode() == ISD::SRL; bool isCopyOrSelect = BinOpLHSVal->getOpcode() == ISD::CopyFromReg || BinOpLHSVal->getOpcode() == ISD::SELECT; if ((!isShift || !isa(BinOpLHSVal->getOperand(1))) && !isCopyOrSelect) return SDValue(); if (isCopyOrSelect && N->hasOneUse()) return SDValue(); EVT VT = N->getValueType(0); // If this is a signed shift right, and the high bit is modified by the // logical operation, do not perform the transformation. The highBitSet // boolean indicates the value of the high bit of the constant which would // cause it to be modified for this operation. if (N->getOpcode() == ISD::SRA) { bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative(); if (BinOpRHSSignSet != HighBitSet) return SDValue(); } if (!TLI.isDesirableToCommuteWithShift(LHS)) return SDValue(); // Fold the constants, shifting the binop RHS by the shift amount. SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)), N->getValueType(0), LHS->getOperand(1), N->getOperand(1)); assert(isa(NewRHS) && "Folding was not successful!"); // Create the new shift. SDValue NewShift = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(0)), VT, LHS->getOperand(0), N->getOperand(1)); // Create the new binop. return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS); } SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { assert(N->getOpcode() == ISD::TRUNCATE); assert(N->getOperand(0).getOpcode() == ISD::AND); // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC) if (N->hasOneUse() && N->getOperand(0).hasOneUse()) { SDValue N01 = N->getOperand(0).getOperand(1); if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) { SDLoc DL(N); EVT TruncVT = N->getValueType(0); SDValue N00 = N->getOperand(0).getOperand(0); SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00); SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01); AddToWorklist(Trunc00.getNode()); AddToWorklist(Trunc01.getNode()); return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01); } } return SDValue(); } SDValue DAGCombiner::visitRotate(SDNode *N) { SDLoc dl(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); unsigned Bitsize = VT.getScalarSizeInBits(); // fold (rot x, 0) -> x if (isNullConstantOrNullSplatConstant(N1)) return N0; // fold (rot x, c) -> (rot x, c % BitSize) if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { if (Cst->getAPIntValue().uge(Bitsize)) { uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize); return DAG.getNode(N->getOpcode(), dl, VT, N0, DAG.getConstant(RotAmt, dl, N1.getValueType())); } } // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1); } unsigned NextOp = N0.getOpcode(); // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { EVT ShiftVT = C1->getValueType(0); bool SameSide = (N->getOpcode() == NextOp); unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; if (SDValue CombinedShift = DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) { SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( ISD::SREM, dl, ShiftVT, CombinedShift.getNode(), BitsizeC.getNode()); return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), CombinedShiftNorm); } } } return SDValue(); } SDValue DAGCombiner::visitSHL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; BuildVectorSDNode *N1CV = dyn_cast(N1); // If setcc produces all-one true value then: // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<isConstant()) { if (N0.getOpcode() == ISD::AND) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); BuildVectorSDNode *N01CV = dyn_cast(N01); if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC && TLI.getBooleanContents(N00.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) { if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N01CV, N1CV)) return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C); } } } } ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (shl c1, c2) -> c1<isOpaque()) return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); // fold (shl 0, x) -> 0 if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (shl x, c >= size(x)) -> undef // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { return Val->getAPIntValue().uge(OpSizeInBits); }; if (matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (shl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; // fold (shl undef, x) -> 0 if (N0.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // if (shl x, c) is known to be zero, return 0 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1); } if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2)) if (N0.getOpcode() == ISD::SHL) { auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).uge(OpSizeInBits); }; if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) return DAG.getConstant(0, SDLoc(N), VT); auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).ult(OpSizeInBits); }; if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { SDLoc DL(N); EVT ShiftVT = N1.getValueType(); SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum); } } // fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2))) // For this to be valid, the second form must not preserve any of the bits // that are shifted out by the inner shift in the first form. This means // the outer shift size must be >= the number of bits added by the ext. // As a corollary, we don't care what kind of ext it is. if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND) && N0.getOperand(0).getOpcode() == ISD::SHL) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) { APInt c1 = N0Op0C1->getAPIntValue(); APInt c2 = N1C->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); EVT InnerShiftVT = N0Op0.getValueType(); uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); if (c2.uge(OpSizeInBits - InnerShiftSize)) { SDLoc DL(N0); APInt Sum = c1 + c2; if (Sum.uge(OpSizeInBits)) return DAG.getConstant(0, DL, VT); return DAG.getNode( ISD::SHL, DL, VT, DAG.getNode(N0.getOpcode(), DL, VT, N0Op0->getOperand(0)), DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType())); } } } // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C)) // Only fold this if the inner zext has no other uses to avoid increasing // the total number of instructions. if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::SRL) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) { if (N0Op0C1->getAPIntValue().ult(VT.getScalarSizeInBits())) { uint64_t c1 = N0Op0C1->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); if (c1 == c2) { SDValue NewOp0 = N0.getOperand(0); EVT CountVT = NewOp0.getOperand(1).getValueType(); SDLoc DL(N); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, NewOp0.getValueType(), NewOp0, DAG.getConstant(c2, DL, CountVT)); AddToWorklist(NewSHL.getNode()); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL); } } } } // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && N0->getFlags().hasExact()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { uint64_t C1 = N0C1->getZExtValue(); uint64_t C2 = N1C->getZExtValue(); SDLoc DL(N); if (C1 <= C2) return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), DAG.getConstant(C2 - C1, DL, N1.getValueType())); return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), DAG.getConstant(C1 - C2, DL, N1.getValueType())); } } // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or // (and (srl x, (sub c1, c2), MASK) // Only fold this if the inner shift has no other uses -- if it does, folding // this will increase the total number of instructions. if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { uint64_t c1 = N0C1->getZExtValue(); if (c1 < OpSizeInBits) { uint64_t c2 = N1C->getZExtValue(); APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); SDValue Shift; if (c2 > c1) { Mask <<= c2 - c1; SDLoc DL(N); Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), DAG.getConstant(c2 - c1, DL, N1.getValueType())); } else { Mask.lshrInPlace(c1 - c2); SDLoc DL(N); Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), DAG.getConstant(c1 - c2, DL, N1.getValueType())); } SDLoc DL(N0); return DAG.getNode(ISD::AND, DL, VT, Shift, DAG.getConstant(Mask, DL, VT)); } } } // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1)) if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) && isConstantOrConstantVector(N1, /* No Opaques */ true)) { SDLoc DL(N); SDValue AllBits = DAG.getAllOnesConstant(DL, VT); SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask); } // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) // Variant of version done on multiply, except mul by a power of 2 is turned // into a shift. if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) && N0.getNode()->hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1); SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); AddToWorklist(Shl0.getNode()); AddToWorklist(Shl1.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1); } // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); if (isConstantOrConstantVector(Shl)) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl); } if (N1C && !N1C->isOpaque()) if (SDValue NewSHL = visitShiftByConstant(N, N1C)) return NewSHL; return SDValue(); } SDValue DAGCombiner::visitSRA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // Arithmetic shifting an all-sign-bit value is a no-op. // fold (sra 0, x) -> 0 // fold (sra -1, x) -> -1 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits) return N0; // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (sra c1, c2) -> (sra c1, c2) ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); // fold (sra x, c >= size(x)) -> undef // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { return Val->getAPIntValue().uge(OpSizeInBits); }; if (matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (sra x, 0) -> x if (N1C && N1C->isNullValue()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits); if (VT.isVector()) ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, VT.getVectorNumElements()); if ((!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT))) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), DAG.getValueType(ExtVT)); } // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) if (N0.getOpcode() == ISD::SRA) { SDLoc DL(N); EVT ShiftVT = N1.getValueType(); auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).uge(OpSizeInBits); }; if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT)); auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).ult(OpSizeInBits); }; if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum); } } // fold (sra (shl X, m), (sub result_size, n)) // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for // result_size - n != m. // If truncate is free for the target sext(shl) is likely to result in better // code. if (N0.getOpcode() == ISD::SHL && N1C) { // Get the two constanst of the shifts, CN0 = m, CN = n. const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1)); if (N01C) { LLVMContext &Ctx = *DAG.getContext(); // Determine what the truncate's result bitsize and type would be. EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue()); if (VT.isVector()) TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements()); // Determine the residual right-shift amount. int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue(); // If the shift is not a no-op (in which case this should be just a sign // extend already), the truncated to type is legal, sign_extend is legal // on that type, and the truncate to that type is both legal and free, // perform the transform. if ((ShiftAmt > 0) && TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) && TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) && TLI.isTruncateFree(VT, TruncVT)) { SDLoc DL(N); SDValue Amt = DAG.getConstant(ShiftAmt, DL, getShiftAmountTy(N0.getOperand(0).getValueType())); SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Amt); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Shift); return DAG.getNode(ISD::SIGN_EXTEND, DL, N->getValueType(0), Trunc); } } } // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1); } // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2)) // if c1 is equal to the number of bits the trunc removes if (N0.getOpcode() == ISD::TRUNCATE && (N0.getOperand(0).getOpcode() == ISD::SRL || N0.getOperand(0).getOpcode() == ISD::SRA) && N0.getOperand(0).hasOneUse() && N0.getOperand(0).getOperand(1).hasOneUse() && N1C) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) { unsigned LargeShiftVal = LargeShift->getZExtValue(); EVT LargeVT = N0Op0.getValueType(); if (LargeVT.getScalarSizeInBits() - OpSizeInBits == LargeShiftVal) { SDLoc DL(N); SDValue Amt = DAG.getConstant(LargeShiftVal + N1C->getZExtValue(), DL, getShiftAmountTy(N0Op0.getOperand(0).getValueType())); SDValue SRA = DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt); return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA); } } } // Simplify, based on bits shifted out of the LHS. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // If the sign bit is known to be zero, switch this to a SRL. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); if (N1C && !N1C->isOpaque()) if (SDValue NewSRA = visitShiftByConstant(N, N1C)) return NewSRA; return SDValue(); } SDValue DAGCombiner::visitSRL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (srl c1, c2) -> c1 >>u c2 ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); // fold (srl 0, x) -> 0 if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (srl x, c >= size(x)) -> undef // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { return Val->getAPIntValue().uge(OpSizeInBits); }; if (matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (srl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // if (srl x, c) is known to be zero, return 0 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2)) if (N0.getOpcode() == ISD::SRL) { auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).uge(OpSizeInBits); }; if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) return DAG.getConstant(0, SDLoc(N), VT); auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).ult(OpSizeInBits); }; if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { SDLoc DL(N); EVT ShiftVT = N1.getValueType(); SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum); } } // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2))) if (N1C && N0.getOpcode() == ISD::TRUNCATE && N0.getOperand(0).getOpcode() == ISD::SRL) { if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) { uint64_t c1 = N001C->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); EVT InnerShiftVT = N0.getOperand(0).getValueType(); EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType(); uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); // This is only valid if the OpSizeInBits + c1 = size of inner shift. if (c1 + OpSizeInBits == InnerShiftSize) { SDLoc DL(N0); if (c1 + c2 >= InnerShiftSize) return DAG.getConstant(0, DL, VT); return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(ISD::SRL, DL, InnerShiftVT, N0.getOperand(0).getOperand(0), DAG.getConstant(c1 + c2, DL, ShiftCountVT))); } } } // fold (srl (shl x, c), c) -> (and x, cst2) if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 && isConstantOrConstantVector(N1, /* NoOpaques */ true)) { SDLoc DL(N); SDValue Mask = DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1); AddToWorklist(Mask.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask); } // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask) if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { // Shifting in all undef bits? EVT SmallVT = N0.getOperand(0).getValueType(); unsigned BitSize = SmallVT.getScalarSizeInBits(); if (N1C->getZExtValue() >= BitSize) return DAG.getUNDEF(VT); if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) { uint64_t ShiftAmt = N1C->getZExtValue(); SDLoc DL0(N0); SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT, N0.getOperand(0), DAG.getConstant(ShiftAmt, DL0, getShiftAmountTy(SmallVT))); AddToWorklist(SmallShift.getNode()); APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt); SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift), DAG.getConstant(Mask, DL, VT)); } } // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign // bit, which is unmodified by sra. if (N1C && N1C->getZExtValue() + 1 == OpSizeInBits) { if (N0.getOpcode() == ISD::SRA) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1); } // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit). if (N1C && N0.getOpcode() == ISD::CTLZ && N1C->getAPIntValue() == Log2_32(OpSizeInBits)) { KnownBits Known; DAG.computeKnownBits(N0.getOperand(0), Known); // If any of the input bits are KnownOne, then the input couldn't be all // zeros, thus the result of the srl will always be zero. if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT); // If all of the bits input the to ctlz node are known to be zero, then // the result of the ctlz is "32" and the result of the shift is one. APInt UnknownBits = ~Known.Zero; if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT); // Otherwise, check to see if there is exactly one bit input to the ctlz. if (UnknownBits.isPowerOf2()) { // Okay, we know that only that the single bit specified by UnknownBits // could be set on input to the CTLZ node. If this bit is set, the SRL // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair // to an SRL/XOR pair, which is likely to simplify more. unsigned ShAmt = UnknownBits.countTrailingZeros(); SDValue Op = N0.getOperand(0); if (ShAmt) { SDLoc DL(N0); Op = DAG.getNode(ISD::SRL, DL, VT, Op, DAG.getConstant(ShAmt, DL, getShiftAmountTy(Op.getValueType()))); AddToWorklist(Op.getNode()); } SDLoc DL(N); return DAG.getNode(ISD::XOR, DL, VT, Op, DAG.getConstant(1, DL, VT)); } } // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); } // fold operands of srl based on knowledge that the low bits are not // demanded. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); if (N1C && !N1C->isOpaque()) if (SDValue NewSRL = visitShiftByConstant(N, N1C)) return NewSRL; // Attempt to convert a srl of a load into a narrower zero-extending load. if (SDValue NarrowLoad = ReduceLoadWidth(N)) return NarrowLoad; // Here is a common situation. We want to optimize: // // %a = ... // %b = and i32 %a, 2 // %c = srl i32 %b, 1 // brcond i32 %c ... // // into // // %a = ... // %b = and %a, 2 // %c = setcc eq %b, 0 // brcond %c ... // // However when after the source operand of SRL is optimized into AND, the SRL // itself may not be optimized further. Look for it and add the BRCOND into // the worklist. if (N->hasOneUse()) { SDNode *Use = *N->use_begin(); if (Use->getOpcode() == ISD::BRCOND) AddToWorklist(Use); else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) { // Also look pass the truncate. Use = *Use->use_begin(); if (Use->getOpcode() == ISD::BRCOND) AddToWorklist(Use); } } return SDValue(); } SDValue DAGCombiner::visitABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (abs c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); // fold (abs (abs x)) -> (abs x) if (N0.getOpcode() == ISD::ABS) return N0; // fold (abs x) -> x iff not-negative if (DAG.SignBitIsZero(N0)) return N0; return SDValue(); } SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (bswap c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0); // fold (bswap (bswap x)) -> x if (N0.getOpcode() == ISD::BSWAP) return N0->getOperand(0); return SDValue(); } SDValue DAGCombiner::visitBITREVERSE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (bitreverse c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0); // fold (bitreverse (bitreverse x)) -> x if (N0.getOpcode() == ISD::BITREVERSE) return N0.getOperand(0); return SDValue(); } SDValue DAGCombiner::visitCTLZ(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctlz c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctlz_zero_undef c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTTZ(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (cttz c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (cttz_zero_undef c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTPOP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctpop c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0); return SDValue(); } /// \brief Generate Min/Max node static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode CC, const TargetLowering &TLI, SelectionDAG &DAG) { if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) return SDValue(); switch (CC) { case ISD::SETOLT: case ISD::SETOLE: case ISD::SETLT: case ISD::SETLE: case ISD::SETULT: case ISD::SETULE: { unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM; if (TLI.isOperationLegal(Opcode, VT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); return SDValue(); } case ISD::SETOGT: case ISD::SETOGE: case ISD::SETGT: case ISD::SETGE: case ISD::SETUGT: case ISD::SETUGE: { unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM; if (TLI.isOperationLegal(Opcode, VT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); return SDValue(); } default: return SDValue(); } } SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { SDValue Cond = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); EVT VT = N->getValueType(0); EVT CondVT = Cond.getValueType(); SDLoc DL(N); if (!VT.isInteger()) return SDValue(); auto *C1 = dyn_cast(N1); auto *C2 = dyn_cast(N2); if (!C1 || !C2) return SDValue(); // Only do this before legalization to avoid conflicting with target-specific // transforms in the other direction (create a select from a zext/sext). There // is also a target-independent combine here in DAGCombiner in the other // direction for (select Cond, -1, 0) when the condition is not i1. if (CondVT == MVT::i1 && !LegalOperations) { if (C1->isNullValue() && C2->isOne()) { // select Cond, 0, 1 --> zext (!Cond) SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); if (VT != MVT::i1) NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond); return NotCond; } if (C1->isNullValue() && C2->isAllOnesValue()) { // select Cond, 0, -1 --> sext (!Cond) SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); if (VT != MVT::i1) NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond); return NotCond; } if (C1->isOne() && C2->isNullValue()) { // select Cond, 1, 0 --> zext (Cond) if (VT != MVT::i1) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); return Cond; } if (C1->isAllOnesValue() && C2->isNullValue()) { // select Cond, -1, 0 --> sext (Cond) if (VT != MVT::i1) Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); return Cond; } // For any constants that differ by 1, we can transform the select into an // extend and add. Use a target hook because some targets may prefer to // transform in the other direction. if (TLI.convertSelectOfConstantsToMath(VT)) { if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) { // select Cond, C1, C1-1 --> add (zext Cond), C1-1 if (VT != MVT::i1) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); } if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) { // select Cond, C1, C1+1 --> add (sext Cond), C1+1 if (VT != MVT::i1) Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); } } return SDValue(); } // fold (select Cond, 0, 1) -> (xor Cond, 1) // We can't do this reliably if integer based booleans have different contents // to floating point based booleans. This is because we can't tell whether we // have an integer-based boolean or a floating-point-based boolean unless we // can find the SETCC that produced it and inspect its operands. This is // fairly easy if C is the SETCC node, but it can potentially be // undiscoverable (or not reasonably discoverable). For example, it could be // in another basic block or it could require searching a complicated // expression. if (CondVT.isInteger() && TLI.getBooleanContents(false, true) == TargetLowering::ZeroOrOneBooleanContent && TLI.getBooleanContents(false, false) == TargetLowering::ZeroOrOneBooleanContent && C1->isNullValue() && C2->isOne()) { SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT)); if (VT.bitsEq(CondVT)) return NotCond; return DAG.getZExtOrTrunc(NotCond, DL, VT); } return SDValue(); } SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); EVT VT = N->getValueType(0); EVT VT0 = N0.getValueType(); SDLoc DL(N); // fold (select C, X, X) -> X if (N1 == N2) return N1; if (const ConstantSDNode *N0C = dyn_cast(N0)) { // fold (select true, X, Y) -> X // fold (select false, X, Y) -> Y return !N0C->isNullValue() ? N1 : N2; } // fold (select X, X, Y) -> (or X, Y) // fold (select X, 1, Y) -> (or C, Y) if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1))) return DAG.getNode(ISD::OR, DL, VT, N0, N2); if (SDValue V = foldSelectOfConstants(N)) return V; // fold (select C, 0, X) -> (and (not C), X) if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2); } // fold (select C, X, 1) -> (or (not C), X) if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1); } // fold (select X, Y, X) -> (and X, Y) // fold (select X, Y, 0) -> (and X, Y) if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2))) return DAG.getNode(ISD::AND, DL, VT, N0, N1); // If we can fold this based on the true/false value, do so. if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. if (VT0 == MVT::i1) { // The code in this block deals with the following 2 equivalences: // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) // The target can specify its preferred form with the // shouldNormalizeToSelectSequence() callback. However we always transform // to the right anyway if we find the inner select exists in the DAG anyway // and we always transform to the left side if we know that we can further // optimize the combination of the conditions. bool normalizeToSequence = TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); // select (and Cond0, Cond1), X, Y // -> select Cond0, (select Cond1, X, Y), Y if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { SDValue Cond0 = N0->getOperand(0); SDValue Cond1 = N0->getOperand(1); SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2); if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, InnerSelect, N2); } // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { SDValue Cond0 = N0->getOperand(0); SDValue Cond1 = N0->getOperand(1); SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2); if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1, InnerSelect); } // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) { SDValue N1_0 = N1->getOperand(0); SDValue N1_1 = N1->getOperand(1); SDValue N1_2 = N1->getOperand(2); if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { // Create the actual and node if we can generate good code for it. if (!normalizeToSequence) { SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, N2); } // Otherwise see if we can optimize the "and" to a better pattern. if (SDValue Combined = visitANDLike(N0, N1_0, N)) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1, N2); } } // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) { SDValue N2_0 = N2->getOperand(0); SDValue N2_1 = N2->getOperand(1); SDValue N2_2 = N2->getOperand(2); if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { // Create the actual or node if we can generate good code for it. if (!normalizeToSequence) { SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, N2_2); } // Otherwise see if we can optimize to a better pattern. if (SDValue Combined = visitORLike(N0, N2_0, N)) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1, N2_2); } } } // select (xor Cond, 1), X, Y -> select Cond, Y, X if (VT0 == MVT::i1) { if (N0->getOpcode() == ISD::XOR) { if (auto *C = dyn_cast(N0->getOperand(1))) { SDValue Cond0 = N0->getOperand(0); if (C->isOne()) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N2, N1); } } } // fold selects based on a setcc into other things, such as min/max/abs if (N0.getOpcode() == ISD::SETCC) { // select x, y (fcmp lt x, y) -> fminnum x, y // select x, y (fcmp gt x, y) -> fmaxnum x, y // // This is OK if we don't care about what happens if either operand is a // NaN. // // FIXME: Instead of testing for UnsafeFPMath, this should be checking for // no signed zeros as well as no nans. const TargetOptions &Options = DAG.getTarget().Options; if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() && DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { ISD::CondCode CC = cast(N0.getOperand(2))->get(); if (SDValue FMinMax = combineMinNumMaxNum( DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG)) return FMinMax; } if ((!LegalOperations && TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || TLI.isOperationLegal(ISD::SELECT_CC, VT)) return DAG.getNode(ISD::SELECT_CC, DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, N0.getOperand(2)); return SimplifySelect(DL, N0, N1, N2); } return SDValue(); } static std::pair SplitVSETCC(const SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // Split the inputs. SDValue Lo, Hi, LL, LH, RL, RH; std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); return std::make_pair(Lo, Hi); } // This function assumes all the vselect's arguments are CONCAT_VECTOR // nodes and that the condition is a BV of ConstantSDNodes (or undefs). static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = N->getValueType(0); int NumElems = VT.getVectorNumElements(); assert(LHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getOpcode() == ISD::CONCAT_VECTORS && Cond.getOpcode() == ISD::BUILD_VECTOR); // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about // binary ones here. if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2) return SDValue(); // We're sure we have an even number of elements due to the // concat_vectors we have as arguments to vselect. // Skip BV elements until we find one that's not an UNDEF // After we find an UNDEF element, keep looping until we get to half the // length of the BV and see if all the non-undef nodes are the same. ConstantSDNode *BottomHalf = nullptr; for (int i = 0; i < NumElems / 2; ++i) { if (Cond->getOperand(i)->isUndef()) continue; if (BottomHalf == nullptr) BottomHalf = cast(Cond.getOperand(i)); else if (Cond->getOperand(i).getNode() != BottomHalf) return SDValue(); } // Do the same for the second half of the BuildVector ConstantSDNode *TopHalf = nullptr; for (int i = NumElems / 2; i < NumElems; ++i) { if (Cond->getOperand(i)->isUndef()) continue; if (TopHalf == nullptr) TopHalf = cast(Cond.getOperand(i)); else if (Cond->getOperand(i).getNode() != TopHalf) return SDValue(); } assert(TopHalf && BottomHalf && "One half of the selector was all UNDEFs and the other was all the " "same value. This should have been addressed before this function."); return DAG.getNode( ISD::CONCAT_VECTORS, DL, VT, BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0), TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1)); } SDValue DAGCombiner::visitMSCATTER(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedScatterSDNode *MSC = cast(N); SDValue Mask = MSC->getMask(); SDValue Data = MSC->getValue(); SDLoc DL(N); // If the MSCATTER data type requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() != ISD::SETCC) return SDValue(); // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MSC->getValueType(0)); SDValue Chain = MSC->getChain(); EVT MemoryVT = MSC->getMemoryVT(); unsigned Alignment = MSC->getOriginalAlignment(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); SDValue BasePtr = MSC->getBasePtr(); SDValue IndexLo, IndexHi; std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MSC->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, MSC->getAAInfo(), MSC->getRanges()); SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo }; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), DL, OpsLo, MMO); SDValue OpsHi[] = {Chain, DataHi, MaskHi, BasePtr, IndexHi}; Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), DL, OpsHi, MMO); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedStoreSDNode *MST = dyn_cast(N); SDValue Mask = MST->getMask(); SDValue Data = MST->getValue(); EVT VT = Data.getValueType(); SDLoc DL(N); // If the MSTORE data type requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() == ISD::SETCC) { // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); SDValue Chain = MST->getChain(); SDValue Ptr = MST->getBasePtr(); EVT MemoryVT = MST->getMemoryVT(); unsigned Alignment = MST->getOriginalAlignment(); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == VT.getSizeInBits() / 8) ? Alignment / 2 : Alignment; EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MST->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, MST->getAAInfo(), MST->getRanges()); Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, MST->isTruncatingStore(), MST->isCompressingStore()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, MST->isCompressingStore()); MMO = DAG.getMachineFunction(). getMachineMemOperand(MST->getPointerInfo(), MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment, MST->getAAInfo(), MST->getRanges()); Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, MST->isTruncatingStore(), MST->isCompressingStore()); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } return SDValue(); } SDValue DAGCombiner::visitMGATHER(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedGatherSDNode *MGT = cast(N); SDValue Mask = MGT->getMask(); SDLoc DL(N); // If the MGATHER result requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() != ISD::SETCC) return SDValue(); EVT VT = N->getValueType(0); // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); SDValue Src0 = MGT->getValue(); SDValue Src0Lo, Src0Hi; std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); SDValue Chain = MGT->getChain(); EVT MemoryVT = MGT->getMemoryVT(); unsigned Alignment = MGT->getOriginalAlignment(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue BasePtr = MGT->getBasePtr(); SDValue Index = MGT->getIndex(); SDValue IndexLo, IndexHi; std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MGT->getAAInfo(), MGT->getRanges()); SDValue OpsLo[] = { Chain, Src0Lo, MaskLo, BasePtr, IndexLo }; Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo, MMO); SDValue OpsHi[] = {Chain, Src0Hi, MaskHi, BasePtr, IndexHi}; Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi, MMO); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); // Build a factor node to remember that this load is independent of the // other one. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalized the chain result - switch anything that used the old chain to // use the new one. DAG.ReplaceAllUsesOfValueWith(SDValue(MGT, 1), Chain); SDValue GatherRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); SDValue RetOps[] = { GatherRes, Chain }; return DAG.getMergeValues(RetOps, DL); } SDValue DAGCombiner::visitMLOAD(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedLoadSDNode *MLD = dyn_cast(N); SDValue Mask = MLD->getMask(); SDLoc DL(N); // If the MLOAD result requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() == ISD::SETCC) { EVT VT = N->getValueType(0); // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); SDValue Src0 = MLD->getSrc0(); SDValue Src0Lo, Src0Hi; std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); SDValue Chain = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); EVT MemoryVT = MLD->getMemoryVT(); unsigned Alignment = MLD->getOriginalAlignment(); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? Alignment/2 : Alignment; EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, MLD->isExpandingLoad()); MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad()); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); // Build a factor node to remember that this load is independent of the // other one. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalized the chain result - switch anything that used the old chain to // use the new one. DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain); SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); SDValue RetOps[] = { LoadRes, Chain }; return DAG.getMergeValues(RetOps, DL); } return SDValue(); } /// A vector select of 2 constant vectors can be simplified to math/logic to /// avoid a variable select instruction and possibly avoid constant loads. SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) { SDValue Cond = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); EVT VT = N->getValueType(0); if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 || !TLI.convertSelectOfConstantsToMath(VT) || !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) || !ISD::isBuildVectorOfConstantSDNodes(N2.getNode())) return SDValue(); // Check if we can use the condition value to increment/decrement a single // constant value. This simplifies a select to an add and removes a constant // load/materialization from the general case. bool AllAddOne = true; bool AllSubOne = true; unsigned Elts = VT.getVectorNumElements(); for (unsigned i = 0; i != Elts; ++i) { SDValue N1Elt = N1.getOperand(i); SDValue N2Elt = N2.getOperand(i); if (N1Elt.isUndef() || N2Elt.isUndef()) continue; const APInt &C1 = cast(N1Elt)->getAPIntValue(); const APInt &C2 = cast(N2Elt)->getAPIntValue(); if (C1 != C2 + 1) AllAddOne = false; if (C1 != C2 - 1) AllSubOne = false; } // Further simplifications for the extra-special cases where the constants are // all 0 or all -1 should be implemented as folds of these patterns. SDLoc DL(N); if (AllAddOne || AllSubOne) { // vselect Cond, C+1, C --> add (zext Cond), C // vselect Cond, C-1, C --> add (sext Cond), C auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond); return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2); } // The general case for select-of-constants: // vselect Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2 // ...but that only makes sense if a vselect is slower than 2 logic ops, so // leave that to a machine-specific pass. return SDValue(); } SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); SDLoc DL(N); // fold (vselect C, X, X) -> X if (N1 == N2) return N1; // Canonicalize integer abs. // vselect (setg[te] X, 0), X, -X -> // vselect (setgt X, -1), X, -X -> // vselect (setl[te] X, 0), -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) if (N0.getOpcode() == ISD::SETCC) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); ISD::CondCode CC = cast(N0.getOperand(2))->get(); bool isAbs = false; bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) || (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) && N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1)) isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode()); else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) && N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1)) isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); if (isAbs) { EVT VT = LHS.getValueType(); if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) return DAG.getNode(ISD::ABS, DL, VT, LHS); SDValue Shift = DAG.getNode( ISD::SRA, DL, VT, LHS, DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift); AddToWorklist(Shift.getNode()); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::XOR, DL, VT, Add, Shift); } } if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. // Fold (vselect (build_vector all_ones), N1, N2) -> N1 if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; // Fold (vselect (build_vector all_zeros), N1, N2) -> N2 if (ISD::isBuildVectorAllZeros(N0.getNode())) return N2; // The ConvertSelectToConcatVector function is assuming both the above // checks for (vselect (build_vector all{ones,zeros) ...) have been made // and addressed. if (N1.getOpcode() == ISD::CONCAT_VECTORS && N2.getOpcode() == ISD::CONCAT_VECTORS && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { if (SDValue CV = ConvertSelectToConcatVector(N, DAG)) return CV; } if (SDValue V = foldVSelectOfConstants(N)) return V; return SDValue(); } SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); SDValue N3 = N->getOperand(3); SDValue N4 = N->getOperand(4); ISD::CondCode CC = cast(N4)->get(); // fold select_cc lhs, rhs, x, x, cc -> x if (N2 == N3) return N2; // Determine if the condition we're dealing with is constant if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, CC, SDLoc(N), false)) { AddToWorklist(SCC.getNode()); if (ConstantSDNode *SCCC = dyn_cast(SCC.getNode())) { if (!SCCC->isNullValue()) return N2; // cond always true -> true val else return N3; // cond always false -> false val } else if (SCC->isUndef()) { // When the condition is UNDEF, just return the first operand. This is // coherent the DAG creation, no setcc node is created in this case return N2; } else if (SCC.getOpcode() == ISD::SETCC) { // Fold to a simpler select_cc return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0), SCC.getOperand(1), N2, N3, SCC.getOperand(2)); } } // If we can fold this based on the true/false value, do so. if (SimplifySelectOps(N, N2, N3)) return SDValue(N, 0); // Don't revisit N. // fold select_cc into other things, such as min/max/abs return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC); } SDValue DAGCombiner::visitSETCC(SDNode *N) { return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1), cast(N->getOperand(2))->get(), SDLoc(N)); } SDValue DAGCombiner::visitSETCCE(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Carry = N->getOperand(2); SDValue Cond = N->getOperand(3); // If Carry is false, fold to a regular SETCC. if (Carry.getOpcode() == ISD::CARRY_FALSE) return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); return SDValue(); } SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Carry = N->getOperand(2); SDValue Cond = N->getOperand(3); // If Carry is false, fold to a regular SETCC. if (isNullConstant(Carry)) return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); return SDValue(); } /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or /// a build_vector of constants. /// This function is called by the DAGCombiner when visiting sext/zext/aext /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). /// Vector extends are not folded if operations are legal; this is to /// avoid introducing illegal build_vector dag nodes. static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, SelectionDAG &DAG, bool LegalTypes, bool LegalOperations) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) && "Expected EXTEND dag node in input!"); // fold (sext c1) -> c1 // fold (zext c1) -> c1 // fold (aext c1) -> c1 if (isa(N0)) return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode(); // fold (sext (build_vector AllConstants) -> (build_vector AllConstants) // fold (zext (build_vector AllConstants) -> (build_vector AllConstants) // fold (aext (build_vector AllConstants) -> (build_vector AllConstants) EVT SVT = VT.getScalarType(); if (!(VT.isVector() && (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) && ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))) return nullptr; // We can fold this node into a build_vector. unsigned VTBits = SVT.getSizeInBits(); unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits(); SmallVector Elts; unsigned NumElts = VT.getVectorNumElements(); SDLoc DL(N); for (unsigned i=0; i != NumElts; ++i) { SDValue Op = N0->getOperand(i); if (Op->isUndef()) { Elts.push_back(DAG.getUNDEF(SVT)); continue; } SDLoc DL(Op); // Get the constant value and if needed trunc it to the size of the type. // Nodes like build_vector might have constants wider than the scalar type. APInt C = cast(Op)->getAPIntValue().zextOrTrunc(EVTBits); if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG) Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT)); else Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT)); } return DAG.getBuildVector(VT, DL, Elts).getNode(); } // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this: // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))" // transformation. Returns true if extension are possible and the above // mentioned transformation is profitable. static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0, unsigned ExtOpc, SmallVectorImpl &ExtendNodes, const TargetLowering &TLI) { bool HasCopyToRegUses = false; bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType()); for (SDNode::use_iterator UI = N0.getNode()->use_begin(), UE = N0.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User == N) continue; if (UI.getUse().getResNo() != N0.getResNo()) continue; // FIXME: Only extend SETCC N, N and SETCC N, c for now. if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) { ISD::CondCode CC = cast(User->getOperand(2))->get(); if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC)) // Sign bits will be lost after a zext. return false; bool Add = false; for (unsigned i = 0; i != 2; ++i) { SDValue UseOp = User->getOperand(i); if (UseOp == N0) continue; if (!isa(UseOp)) return false; Add = true; } if (Add) ExtendNodes.push_back(User); continue; } // If truncates aren't free and there are users we can't // extend, it isn't worthwhile. if (!isTruncFree) return false; // Remember if this value is live-out. if (User->getOpcode() == ISD::CopyToReg) HasCopyToRegUses = true; } if (HasCopyToRegUses) { bool BothLiveOut = false; for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { SDUse &Use = UI.getUse(); if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) { BothLiveOut = true; break; } } if (BothLiveOut) // Both unextended and extended values are live out. There had better be // a good reason for the transformation. return ExtendNodes.size(); } return true; } void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl &SetCCs, SDValue Trunc, SDValue ExtLoad, const SDLoc &DL, ISD::NodeType ExtType) { // Extend SetCC uses if necessary. for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) { SDNode *SetCC = SetCCs[i]; SmallVector Ops; for (unsigned j = 0; j != 2; ++j) { SDValue SOp = SetCC->getOperand(j); if (SOp == Trunc) Ops.push_back(ExtLoad); else Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp)); } Ops.push_back(SetCC->getOperand(2)); CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops)); } } // FIXME: Bring more similar combines here, common to sext/zext (maybe aext?). SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue N0 = N->getOperand(0); EVT DstVT = N->getValueType(0); EVT SrcVT = N0.getValueType(); assert((N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) && "Unexpected node type (not an extend)!"); // fold (sext (load x)) to multiple smaller sextloads; same for zext. // For example, on a target with legal v4i32, but illegal v8i32, turn: // (v8i32 (sext (v8i16 (load x)))) // into: // (v8i32 (concat_vectors (v4i32 (sextload x)), // (v4i32 (sextload (x + 16))))) // Where uses of the original load, i.e.: // (v8i16 (load x)) // are replaced with: // (v8i16 (truncate // (v8i32 (concat_vectors (v4i32 (sextload x)), // (v4i32 (sextload (x + 16))))))) // // This combine is only applicable to illegal, but splittable, vectors. // All legal types, and illegal non-vector types, are handled elsewhere. // This combine is controlled by TargetLowering::isVectorLoadExtDesirable. // if (N0->getOpcode() != ISD::LOAD) return SDValue(); LoadSDNode *LN0 = cast(N0); if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) || !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() || !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0))) return SDValue(); SmallVector SetCCs; if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI)) return SDValue(); ISD::LoadExtType ExtType = N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; // Try to split the vector types to get down to legal types. EVT SplitSrcVT = SrcVT; EVT SplitDstVT = DstVT; while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) && SplitSrcVT.getVectorNumElements() > 1) { SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first; SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first; } if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) return SDValue(); SDLoc DL(N); const unsigned NumSplits = DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements(); const unsigned Stride = SplitSrcVT.getStoreSize(); SmallVector Loads; SmallVector Chains; SDValue BasePtr = LN0->getBasePtr(); for (unsigned Idx = 0; Idx < NumSplits; Idx++) { const unsigned Offset = Idx * Stride; const unsigned Align = MinAlign(LN0->getAlignment(), Offset); SDValue SplitLoad = DAG.getExtLoad( ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr, LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, DAG.getConstant(Stride, DL, BasePtr.getValueType())); Loads.push_back(SplitLoad.getValue(0)); Chains.push_back(SplitLoad.getValue(1)); } SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); // Simplify TF. AddToWorklist(NewChain.getNode()); CombineTo(N, NewValue); // Replace uses of the original load (before extension) // with a truncate of the concatenated sextloaded vectors. SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue); CombineTo(N0.getNode(), Trunc, NewChain); ExtendSetCCUses(SetCCs, Trunc, NewValue, DL, (ISD::NodeType)N->getOpcode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! } /// If we're narrowing or widening the result of a vector select and the final /// size is the same size as a setcc (compare) feeding the select, then try to /// apply the cast operation to the select's operands because matching vector /// sizes for a select condition and other operands should be more efficient. SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) { unsigned CastOpcode = Cast->getOpcode(); assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND || CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND || CastOpcode == ISD::FP_ROUND) && "Unexpected opcode for vector select narrowing/widening"); // We only do this transform before legal ops because the pattern may be // obfuscated by target-specific operations after legalization. Do not create // an illegal select op, however, because that may be difficult to lower. EVT VT = Cast->getValueType(0); if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); SDValue VSel = Cast->getOperand(0); if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() || VSel.getOperand(0).getOpcode() != ISD::SETCC) return SDValue(); // Does the setcc have the same vector size as the casted select? SDValue SetCC = VSel.getOperand(0); EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType()); if (SetCCVT.getSizeInBits() != VT.getSizeInBits()) return SDValue(); // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B) SDValue A = VSel.getOperand(1); SDValue B = VSel.getOperand(2); SDValue CastA, CastB; SDLoc DL(Cast); if (CastOpcode == ISD::FP_ROUND) { // FP_ROUND (fptrunc) has an extra flag operand to pass along. CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1)); CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1)); } else { CastA = DAG.getNode(CastOpcode, DL, VT, A); CastB = DAG.getNode(CastOpcode, DL, VT, B); } return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB); } SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); SDLoc DL(N); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (sext (sext x)) -> (sext x) // fold (sext (aext x)) -> (sext x) if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)); if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n))) if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } // See if the value being truncated is already sign extended. If so, just // eliminate the trunc/sext pair. SDValue Op = N0.getOperand(0); unsigned OpBits = Op.getScalarValueSizeInBits(); unsigned MidBits = N0.getScalarValueSizeInBits(); unsigned DestBits = VT.getScalarSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op); if (OpBits == DestBits) { // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign // bits, it is already ready. if (NumSignBits > DestBits-MidBits) return Op; } else if (OpBits < DestBits) { // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign // bits, just sext from i32. if (NumSignBits > OpBits-MidBits) return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op); } else { // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign // bits, just truncate to i32. if (NumSignBits > OpBits-MidBits) return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } // fold (sext (truncate x)) -> (sextinreg x). if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, N0.getValueType())) { if (OpBits < DestBits) Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op); else if (OpBits > DestBits) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op); return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(N0.getValueType())); } } // fold (sext (load x)) -> (sext (truncate (sextload x))) // Only generate vector extloads when 1) they're legal, and 2) they are // deemed desirable by the target. if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !VT.isVector() && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI); if (VT.isVector()) DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); // If the load value is used only by N, replace it via CombineTo N. bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); CombineTo(N, ExtLoad); if (NoReplaceTrunc) DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); else CombineTo(LN0, Trunc, ExtLoad.getValue(1)); return SDValue(N, 0); } } // fold (sext (load x)) to multiple smaller sextloads. // Only on illegal but splittable vectors. if (SDValue ExtLoad = CombineExtLoad(N)) return ExtLoad; // fold (sext (sextload x)) -> (sext (truncate (sextload x))) // fold (sext ( extload x)) -> (sext (truncate (sextload x))) if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (sext (and/or/xor (load x), cst)) -> // (and/or/xor (sextload x), (sext cst)) if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast(N0.getOperand(0)); if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND, SetCCs, TLI); if (DoXform) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), LN0->getMemoryVT(), LN0->getMemOperand()); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.sext(VT.getSizeInBits()); SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, ExtLoad, DAG.getConstant(Mask, DL, VT)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); bool NoReplaceTruncAnd = !N0.hasOneUse(); bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); CombineTo(N, And); // If N0 has multiple uses, change other uses as well. if (NoReplaceTruncAnd) { SDValue TruncAnd = DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); CombineTo(N0.getNode(), TruncAnd); } if (NoReplaceTrunc) DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); else CombineTo(LN0, Trunc, ExtLoad.getValue(1)); return SDValue(N,0); // Return N so it doesn't get rechecked! } } } if (N0.getOpcode() == ISD::SETCC) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); ISD::CondCode CC = cast(N0.getOperand(2))->get(); EVT N00VT = N0.getOperand(0).getValueType(); // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. if (VT.isVector() && !LegalOperations && TLI.getBooleanContents(N00VT) == TargetLowering::ZeroOrNegativeOneBooleanContent) { // On some architectures (such as SSE/NEON/etc) the SETCC result type is // of the same size as the compared operands. Only optimize sext(setcc()) // if this is the case. EVT SVT = getSetCCResultType(N00VT); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result // for that matter). Check to see that they are the same size. If so, // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == SVT.getSizeInBits()) return DAG.getSetCC(DL, VT, N00, N01, CC); // If the desired elements are smaller or larger than the source // elements, we can use a matching integer vector type and then // truncate/sign extend. EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); if (SVT == MatchingVecType) { SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); return DAG.getSExtOrTrunc(VsetCC, DL, VT); } } // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) // Here, T can be 1 or -1, depending on the type of the setcc and // getBooleanContents(). unsigned SetCCWidth = N0.getScalarValueSizeInBits(); // To determine the "true" side of the select, we need to know the high bit // of the value returned by the setcc if it evaluates to true. // If the type of the setcc is i1, then the true case of the select is just // sext(i1 1), that is, -1. // If the type of the setcc is larger (say, i8) then the value of the high // bit depends on getBooleanContents(), so ask TLI for a real "true" value // of the appropriate width. SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT) : TLI.getConstTrueVal(DAG, VT, DL); SDValue Zero = DAG.getConstant(0, DL, VT); if (SDValue SCC = SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true)) return SCC; if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) { EVT SetCCVT = getSetCCResultType(N00VT); // Don't do this transform for i1 because there's a select transform // that would reverse it. // TODO: We should not do this transform at all without a target hook // because a sext is likely cheaper than a select? if (SetCCVT.getScalarSizeInBits() != 1 && (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) { SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC); return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero); } } } // fold (sext x) -> (zext x) if the sign bit is known zero. if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0); if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } // isTruncateOf - If N is a truncate of some other value, return true, record // the value being truncated in Op and which of Op's bits are zero/one in Known. // This function computes KnownBits to avoid a duplicated call to // computeKnownBits in the caller. static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, KnownBits &Known) { if (N->getOpcode() == ISD::TRUNCATE) { Op = N->getOperand(0); DAG.computeKnownBits(Op, Known); return true; } if (N->getOpcode() != ISD::SETCC || N->getValueType(0) != MVT::i1 || cast(N->getOperand(2))->get() != ISD::SETNE) return false; SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); assert(Op0.getValueType() == Op1.getValueType()); if (isNullConstant(Op0)) Op = Op1; else if (isNullConstant(Op1)) Op = Op0; else return false; DAG.computeKnownBits(Op, Known); if (!(Known.Zero | 1).isAllOnesValue()) return false; return true; } SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (zext (zext x)) -> (zext x) // fold (zext (aext x)) -> (zext x) if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0)); // fold (zext (truncate x)) -> (zext x) or // (zext (truncate x)) -> (truncate x) // This is valid when the truncated bits of x are already zero. // FIXME: We should extend this to work for vectors too. SDValue Op; KnownBits Known; if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) { APInt TruncatedBits = (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ? APInt(Op.getValueSizeInBits(), 0) : APInt::getBitsSet(Op.getValueSizeInBits(), N0.getValueSizeInBits(), std::min(Op.getValueSizeInBits(), VT.getSizeInBits())); if (TruncatedBits.isSubsetOf(Known.Zero)) return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); } // fold (zext (truncate x)) -> (and x, mask) if (N0.getOpcode() == ISD::TRUNCATE) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n))) if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } EVT SrcVT = N0.getOperand(0).getValueType(); EVT MinVT = N0.getValueType(); // Try to mask before the extension to avoid having to generate a larger mask, // possibly over several sub-vectors. if (SrcVT.bitsLT(VT)) { if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) && TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) { SDValue Op = N0.getOperand(0); Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); AddToWorklist(Op.getNode()); return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); } } if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); AddToWorklist(Op.getNode()); SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); // We may safely transfer the debug info describing the truncate node over // to the equivalent and operation. DAG.transferDbgValues(N0, And); return And; } } // Fold (zext (and (trunc x), cst)) -> (and x, cst), // if either of the casts is not free. if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), N0.getValueType()) || !TLI.isZExtFree(N0.getValueType(), VT))) { SDValue X = N0.getOperand(0).getOperand(0); X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(Mask, DL, VT)); } // fold (zext (load x)) -> (zext (truncate (zextload x))) // Only generate vector extloads when 1) they're legal, and 2) they are // deemed desirable by the target. if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !VT.isVector() && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI); if (VT.isVector()) DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); // If the load value is used only by N, replace it via CombineTo N. bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); CombineTo(N, ExtLoad); if (NoReplaceTrunc) DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); else CombineTo(LN0, Trunc, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (zext (load x)) to multiple smaller zextloads. // Only on illegal but splittable vectors. if (SDValue ExtLoad = CombineExtLoad(N)) return ExtLoad; // fold (zext (and/or/xor (load x), cst)) -> // (and/or/xor (zextload x), (zext cst)) // Unless (and (load x) cst) will match as a zextload already and has // additional users. if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast(N0.getOperand(0)); if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) { if (N0.getOpcode() == ISD::AND) { auto *AndC = cast(N0.getOperand(1)); EVT LoadResultTy = AndC->getValueType(0); EVT ExtVT; if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT)) DoXform = false; } if (DoXform) DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::ZERO_EXTEND, SetCCs, TLI); } if (DoXform) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), LN0->getMemoryVT(), LN0->getMemOperand()); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL(N); SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, ExtLoad, DAG.getConstant(Mask, DL, VT)); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); bool NoReplaceTruncAnd = !N0.hasOneUse(); bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); CombineTo(N, And); // If N0 has multiple uses, change other uses as well. if (NoReplaceTruncAnd) { SDValue TruncAnd = DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); CombineTo(N0.getNode(), TruncAnd); } if (NoReplaceTrunc) DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); else CombineTo(LN0, Trunc, ExtLoad.getValue(1)); return SDValue(N,0); // Return N so it doesn't get rechecked! } } } // fold (zext (zextload x)) -> (zext (truncate (zextload x))) // fold (zext ( extload x)) -> (zext (truncate (zextload x))) if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } if (N0.getOpcode() == ISD::SETCC) { // Only do this before legalize for now. if (!LegalOperations && VT.isVector() && N0.getValueType().getVectorElementType() == MVT::i1) { EVT N00VT = N0.getOperand(0).getValueType(); if (getSetCCResultType(N00VT) == N0.getValueType()) return SDValue(); // We know that the # elements of the results is the same as the # // elements of the compare (and the # elements of the compare result for // that matter). Check to see that they are the same size. If so, we know // that the element size of the sext'd result matches the element size of // the compare operands. SDLoc DL(N); SDValue VecOnes = DAG.getConstant(1, DL, VT); if (VT.getSizeInBits() == N00VT.getSizeInBits()) { // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors. SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0), N0.getOperand(1), N0.getOperand(2)); return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes); } // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/sign extend. EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); SDValue VsetCC = DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0), N0.getOperand(1), N0.getOperand(2)); return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT), VecOnes); } // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc SDLoc DL(N); if (SDValue SCC = SimplifySelectCC( DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT), cast(N0.getOperand(2))->get(), true)) return SCC; } // (zext (shl (zext x), cst)) -> (shl (zext x), cst) if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) && isa(N0.getOperand(1)) && N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse()) { SDValue ShAmt = N0.getOperand(1); unsigned ShAmtVal = cast(ShAmt)->getZExtValue(); if (N0.getOpcode() == ISD::SHL) { SDValue InnerZExt = N0.getOperand(0); // If the original shl may be shifting out bits, do not perform this // transformation. unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() - InnerZExt.getOperand(0).getValueSizeInBits(); if (ShAmtVal > KnownZeroBits) return SDValue(); } SDLoc DL(N); // Ensure that the shift amount is wide enough for the shifted value. if (VT.getSizeInBits() >= 256) ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt); return DAG.getNode(N0.getOpcode(), DL, VT, DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)), ShAmt); } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) // fold (aext (sext x)) -> (sext x) if (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (aext (truncate x)) if (N0.getOpcode() == ISD::TRUNCATE) return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), N0.getValueType())) { SDLoc DL(N); SDValue X = N0.getOperand(0).getOperand(0); X = DAG.getAnyExtOrTrunc(X, DL, VT); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(Mask, DL, VT)); } // fold (aext (load x)) -> (aext (truncate (extload x))) // None of the supported targets knows how to perform load and any_ext // on vectors in one instruction. We only perform this transformation on // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ANY_EXTEND); // If the load value is used only by N, replace it via CombineTo N. bool NoReplaceTrunc = N0.hasOneUse(); CombineTo(N, ExtLoad); if (NoReplaceTrunc) DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); else CombineTo(LN0, Trunc, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (aext (zextload x)) -> (aext (truncate (zextload x))) // fold (aext (sextload x)) -> (aext (truncate (sextload x))) // fold (aext ( extload x)) -> (aext (truncate (extload x))) if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } if (N0.getOpcode() == ISD::SETCC) { // For vectors: // aext(setcc) -> vsetcc // aext(setcc) -> truncate(vsetcc) // aext(setcc) -> aext(vsetcc) // Only do this before legalize for now. if (VT.isVector() && !LegalOperations) { EVT N00VT = N0.getOperand(0).getValueType(); if (getSetCCResultType(N00VT) == N0.getValueType()) return SDValue(); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result // for that matter). Check to see that they are the same size. If so, // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == N00VT.getSizeInBits()) return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/any extend else { EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT); } } // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc SDLoc DL(N); if (SDValue SCC = SimplifySelectCC( DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT), cast(N0.getOperand(2))->get(), true)) return SCC; } return SDValue(); } SDValue DAGCombiner::visitAssertExt(SDNode *N) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT AssertVT = cast(N1)->getVT(); // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt) if (N0.getOpcode() == Opcode && AssertVT == cast(N0.getOperand(1))->getVT()) return N0; if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && N0.getOperand(0).getOpcode() == Opcode) { // We have an assert, truncate, assert sandwich. Make one stronger assert // by asserting on the smallest asserted type to the larger source type. // This eliminates the later assert: // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN SDValue BigA = N0.getOperand(0); EVT BigA_AssertVT = cast(BigA.getOperand(1))->getVT(); assert(BigA_AssertVT.bitsLE(N0.getValueType()) && "Asserting zero/sign-extended bits to a type larger than the " "truncated destination does not provide information"); SDLoc DL(N); EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT; SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT); SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(), BigA.getOperand(0), MinAssertVTVal); return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert); } return SDValue(); } /// If the result of a wider load is shifted to right of N bits and then /// truncated to a narrower type and where N is a multiple of number of bits of /// the narrower type, transform it to a narrower load from address + N / num of /// bits of new type. Also narrow the load if the result is masked with an AND /// to effectively produce a smaller type. If the result is to be extended, also /// fold the extension to form a extending load. SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { unsigned Opc = N->getOpcode(); ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT ExtVT = VT; // This transformation isn't valid for vector loads. if (VT.isVector()) return SDValue(); // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then // extended to VT. if (Opc == ISD::SIGN_EXTEND_INREG) { ExtType = ISD::SEXTLOAD; ExtVT = cast(N->getOperand(1))->getVT(); } else if (Opc == ISD::SRL) { // Another special-case: SRL is basically zero-extending a narrower value, // or it maybe shifting a higher subword, half or byte into the lowest // bits. ExtType = ISD::ZEXTLOAD; N0 = SDValue(N, 0); auto *LN0 = dyn_cast(N0.getOperand(0)); auto *N01 = dyn_cast(N0.getOperand(1)); if (!N01 || !LN0) return SDValue(); uint64_t ShiftAmt = N01->getZExtValue(); uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits(); if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt) ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt); else ExtVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() - ShiftAmt); } else if (Opc == ISD::AND) { // An AND with a constant mask is the same as a truncate + zero-extend. auto AndC = dyn_cast(N->getOperand(1)); if (!AndC || !AndC->getAPIntValue().isMask()) return SDValue(); unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes(); ExtType = ISD::ZEXTLOAD; ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); } unsigned ShAmt = 0; if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { if (ConstantSDNode *N01 = dyn_cast(N0.getOperand(1))) { ShAmt = N01->getZExtValue(); unsigned EVTBits = ExtVT.getSizeInBits(); // Is the shift amount a multiple of size of VT? if ((ShAmt & (EVTBits-1)) == 0) { N0 = N0.getOperand(0); // Is the load width a multiple of size of VT? if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0) return SDValue(); } // At this point, we must have a load or else we can't do the transform. if (!isa(N0)) return SDValue(); // Because a SRL must be assumed to *need* to zero-extend the high bits // (as opposed to anyext the high bits), we can't combine the zextload // lowering of SRL and an sextload. if (cast(N0)->getExtensionType() == ISD::SEXTLOAD) return SDValue(); // If the shift amount is larger than the input type then we're not // accessing any of the loaded bytes. If the load was a zextload/extload // then the result of the shift+trunc is zero/undef (handled elsewhere). if (ShAmt >= cast(N0)->getMemoryVT().getSizeInBits()) return SDValue(); } } // If the load is shifted left (and the result isn't shifted back right), // we can fold the truncate through the shift. unsigned ShLeftAmt = 0; if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() && ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) { if (ConstantSDNode *N01 = dyn_cast(N0.getOperand(1))) { ShLeftAmt = N01->getZExtValue(); N0 = N0.getOperand(0); } } // If we haven't found a load, we can't narrow it. if (!isa(N0)) return SDValue(); LoadSDNode *LN0 = cast(N0); if (!isLegalNarrowLoad(LN0, ExtType, ExtVT, ShAmt)) return SDValue(); // For big endian targets, we need to adjust the offset to the pointer to // load the correct bytes. if (DAG.getDataLayout().isBigEndian()) { unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits(); unsigned EVTStoreBits = ExtVT.getStoreSizeInBits(); ShAmt = LVTStoreBits - EVTStoreBits - ShAmt; } EVT PtrType = N0.getOperand(1).getValueType(); uint64_t PtrOff = ShAmt / 8; unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); SDLoc DL(LN0); // The original load itself didn't wrap, so an offset within it doesn't. SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, LN0->getBasePtr(), DAG.getConstant(PtrOff, DL, PtrType), Flags); AddToWorklist(NewPtr.getNode()); SDValue Load; if (ExtType == ISD::NON_EXTLOAD) Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); else Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT, NewAlign, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); // Replace the old load's chain with the new load's chain. WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); // Shift the result left, if we've swallowed a left shift. SDValue Result = Load; if (ShLeftAmt != 0) { EVT ShImmTy = getShiftAmountTy(Result.getValueType()); if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt)) ShImmTy = VT; // If the shift amount is as large as the result size (but, presumably, // no larger than the source) then the useful bits of the result are // zero; we can't simply return the shortened shift, because the result // of that operation is undefined. SDLoc DL(N0); if (ShLeftAmt >= VT.getSizeInBits()) Result = DAG.getConstant(0, DL, VT); else Result = DAG.getNode(ISD::SHL, DL, VT, Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy)); } // Return the new loaded value. return Result; } SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); EVT EVT = cast(N1)->getVT(); unsigned VTBits = VT.getScalarSizeInBits(); unsigned EVTBits = EVT.getScalarSizeInBits(); if (N0.isUndef()) return DAG.getUNDEF(VT); // fold (sext_in_reg c1) -> c1 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); // If the input is already sign extended, just drop the extension. if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1) return N0; // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2 if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG && EVT.bitsLT(cast(N0.getOperand(1))->getVT())) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), N1); // fold (sext_in_reg (sext x)) -> (sext x) // fold (sext_in_reg (aext x)) -> (sext x) // if x is small enough. if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getScalarValueSizeInBits() <= EVTBits && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); } // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x) if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) { if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)) return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT); } // fold (sext_in_reg (zext x)) -> (sext x) // iff we are extending the source sign bit. if (N0.getOpcode() == ISD::ZERO_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getScalarValueSizeInBits() == EVTBits && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1))) return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType()); // fold operands of sext_in_reg based on knowledge that the top bits are not // demanded. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (sext_in_reg (load x)) -> (smaller sextload x) // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits)) if (SDValue NarrowLoad = ReduceLoadWidth(N)) return NarrowLoad; // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24) // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible. // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above. if (N0.getOpcode() == ISD::SRL) { if (ConstantSDNode *ShAmt = dyn_cast(N0.getOperand(1))) if (ShAmt->getZExtValue()+EVTBits <= VTBits) { // We can turn this into an SRA iff the input to the SRL is already sign // extended enough. unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0)); if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits) return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1)); } } // fold (sext_inreg (extload x)) -> (sextload x) // If sextload is not supported by target, we can only do the combine when // load has one use. Doing otherwise can block folding the extload with other // extends that the target does support. if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && EVT == cast(N0)->getMemoryVT() && ((!LegalOperations && !cast(N0)->isVolatile() && N0.hasOneUse()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), EVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); AddToWorklist(ExtLoad.getNode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse() && EVT == cast(N0)->getMemoryVT() && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), EVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16)) if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false)) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); } return SDValue(); } SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.isUndef()) return DAG.getUNDEF(VT); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); return SDValue(); } SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.isUndef()) return DAG.getUNDEF(VT); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); return SDValue(); } SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); bool isLE = DAG.getDataLayout().isLittleEndian(); // noop truncate if (N0.getValueType() == N->getValueType(0)) return N0; // fold (truncate (truncate x)) -> (truncate x) if (N0.getOpcode() == ISD::TRUNCATE) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); // fold (truncate c1) -> c1 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0); if (C.getNode() != N) return C; } // fold (truncate (ext x)) -> (ext x) or (truncate x) or x if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { // if the source is smaller than the dest, we still need an extend. if (N0.getOperand(0).getValueType().bitsLT(VT)) return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); // if the source is larger than the dest, than we just need the truncate. if (N0.getOperand(0).getValueType().bitsGT(VT)) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); // if the source and dest are the same type, we can drop both the extend // and the truncate. return N0.getOperand(0); } // If this is anyext(trunc), don't fold it, allow ourselves to be folded. if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND)) return SDValue(); // Fold extract-and-trunc into a narrow extract. For example: // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1) // i32 y = TRUNCATE(i64 x) // -- becomes -- // v16i8 b = BITCAST (v2i64 val) // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8) // // Note: We only run this optimization after type legalization (which often // creates this pattern) and before operation legalization after which // we need to be more careful about the vector instructions that we generate. if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) { EVT VecTy = N0.getOperand(0).getValueType(); EVT ExTy = N0.getValueType(); EVT TrTy = N->getValueType(0); unsigned NumElem = VecTy.getVectorNumElements(); unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem); assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size"); SDValue EltNo = N0->getOperand(1); if (isa(EltNo) && isTypeLegal(NVT)) { int Elt = cast(EltNo)->getZExtValue(); EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout()); int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1)); SDLoc DL(N); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy, DAG.getBitcast(NVT, N0.getOperand(0)), DAG.getConstant(Index, DL, IndexTy)); } } // trunc (select c, a, b) -> select c, (trunc a), (trunc b) if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) { EVT SrcVT = N0.getValueType(); if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) && TLI.isTruncateFree(SrcVT, VT)) { SDLoc SL(N0); SDValue Cond = N0.getOperand(0); SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2)); return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1); } } // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits() if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) && TLI.isTypeDesirableForOp(ISD::SHL, VT)) { SDValue Amt = N0.getOperand(1); KnownBits Known; DAG.computeKnownBits(Amt, Known); unsigned Size = VT.getScalarSizeInBits(); if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { SDLoc SL(N); EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); if (AmtVT != Amt.getValueType()) { Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT); AddToWorklist(Amt.getNode()); } return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt); } } // Fold a series of buildvector, bitcast, and truncate if possible. // For example fold // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to // (2xi32 (buildvector x, y)). if (Level == AfterLegalizeVectorOps && VT.isVector() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && N0.getOperand(0).hasOneUse()) { SDValue BuildVect = N0.getOperand(0); EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType(); EVT TruncVecEltTy = VT.getVectorElementType(); // Check that the element types match. if (BuildVectEltTy == TruncVecEltTy) { // Now we only need to compute the offset of the truncated elements. unsigned BuildVecNumElts = BuildVect.getNumOperands(); unsigned TruncVecNumElts = VT.getVectorNumElements(); unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts; assert((BuildVecNumElts % TruncVecNumElts) == 0 && "Invalid number of elements"); SmallVector Opnds; for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) Opnds.push_back(BuildVect.getOperand(i)); return DAG.getBuildVector(VT, SDLoc(N), Opnds); } } // See if we can simplify the input to this truncate through knowledge that // only the low bits are being used. // For example "trunc (or (shl x, 8), y)" // -> trunc y // Currently we only perform this optimization on scalars because vectors // may have different active low bits. if (!VT.isVector()) { APInt Mask = APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits()); if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask)) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); } // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { if (SDValue Reduced = ReduceLoadWidth(N)) return Reduced; // Handle the case where the load remains an extending load even // after truncation. if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) { LoadSDNode *LN0 = cast(N0); if (!LN0->isVolatile() && LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) { SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), LN0->getMemoryVT(), LN0->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1)); return NewLoad; } } } // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)), // where ... are all 'undef'. if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) { SmallVector VTs; SDValue V; unsigned Idx = 0; unsigned NumDefs = 0; for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) { SDValue X = N0.getOperand(i); if (!X.isUndef()) { V = X; Idx = i; NumDefs++; } // Stop if more than one members are non-undef. if (NumDefs > 1) break; VTs.push_back(EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), X.getValueType().getVectorNumElements())); } if (NumDefs == 0) return DAG.getUNDEF(VT); if (NumDefs == 1) { assert(V.getNode() && "The single defined operand is empty!"); SmallVector Opnds; for (unsigned i = 0, e = VTs.size(); i != e; ++i) { if (i != Idx) { Opnds.push_back(DAG.getUNDEF(VTs[i])); continue; } SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V); AddToWorklist(NV.getNode()); Opnds.push_back(NV); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds); } } // Fold truncate of a bitcast of a vector to an extract of the low vector // element. // // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) { SDValue VecSrc = N0.getOperand(0); EVT SrcVT = VecSrc.getValueType(); if (SrcVT.isVector() && SrcVT.getScalarType() == VT && (!LegalOperations || TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) { SDLoc SL(N); EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc, DAG.getConstant(Idx, SL, IdxVT)); } } // Simplify the operands using demanded-bits information. if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) // When the adde's carry is not used. if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) && N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) && (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT))) { SDLoc SL(N); auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); auto VTs = DAG.getVTList(VT, N0->getValueType(1)); return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2)); } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } static SDNode *getBuildPairElt(SDNode *N, unsigned i) { SDValue Elt = N->getOperand(i); if (Elt.getOpcode() != ISD::MERGE_VALUES) return Elt.getNode(); return Elt.getOperand(Elt.getResNo()).getNode(); } /// build_pair (load, load) -> load /// if load locations are consecutive. SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) { assert(N->getOpcode() == ISD::BUILD_PAIR); LoadSDNode *LD1 = dyn_cast(getBuildPairElt(N, 0)); LoadSDNode *LD2 = dyn_cast(getBuildPairElt(N, 1)); // A BUILD_PAIR is always having the least significant part in elt 0 and the // most significant part in elt 1. So when combining into one large load, we // need to consider the endianness. if (DAG.getDataLayout().isBigEndian()) std::swap(LD1, LD2); if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() || LD1->getAddressSpace() != LD2->getAddressSpace()) return SDValue(); EVT LD1VT = LD1->getValueType(0); unsigned LD1Bytes = LD1VT.getStoreSize(); if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() && DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) { unsigned Align = LD1->getAlignment(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( VT.getTypeForEVT(*DAG.getContext())); if (NewAlign <= Align && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(), LD1->getPointerInfo(), Align); } return SDValue(); } static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) { // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi // and Lo parts; on big-endian machines it doesn't. return DAG.getDataLayout().isBigEndian() ? 1 : 0; } static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { // If this is not a bitcast to an FP type or if the target doesn't have // IEEE754-compliant FP logic, we're done. EVT VT = N->getValueType(0); if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT)) return SDValue(); // TODO: Use splat values for the constant-checking below and remove this // restriction. SDValue N0 = N->getOperand(0); EVT SourceVT = N0.getValueType(); if (SourceVT.isVector()) return SDValue(); unsigned FPOpcode; APInt SignMask; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = ISD::FABS; SignMask = ~APInt::getSignMask(SourceVT.getSizeInBits()); break; case ISD::XOR: FPOpcode = ISD::FNEG; SignMask = APInt::getSignMask(SourceVT.getSizeInBits()); break; // TODO: ISD::OR --> ISD::FNABS? default: return SDValue(); } // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X SDValue LogicOp0 = N0.getOperand(0); ConstantSDNode *LogicOp1 = dyn_cast(N0.getOperand(1)); if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask && LogicOp0.getOpcode() == ISD::BITCAST && LogicOp0->getOperand(0).getValueType() == VT) return DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0->getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.isUndef()) return DAG.getUNDEF(VT); // If the input is a BUILD_VECTOR with all constant elements, fold this now. // Only do this before legalize, since afterward the target may be depending // on the bitconvert. // First check to see if this is all constant. if (!LegalTypes && N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() && VT.isVector()) { bool isSimple = cast(N0)->isConstant(); EVT DestEltVT = N->getValueType(0).getVectorElementType(); assert(!DestEltVT.isVector() && "Element type of vector ValueType must not be vector!"); if (isSimple) return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), DestEltVT); } // If the input is a constant, let getNode fold it. if (isa(N0) || isa(N0)) { // If we can't allow illegal operations, we need to check that this is just // a fp -> int or int -> conversion and that the resulting operation will // be legal. if (!LegalOperations || (isa(N0) && VT.isFloatingPoint() && !VT.isVector() && TLI.isOperationLegal(ISD::ConstantFP, VT)) || (isa(N0) && VT.isInteger() && !VT.isVector() && TLI.isOperationLegal(ISD::Constant, VT))) return DAG.getBitcast(VT, N0); } // (conv (conv x, t1), t2) -> (conv x, t2) if (N0.getOpcode() == ISD::BITCAST) return DAG.getBitcast(VT, N0.getOperand(0)); // fold (conv (load x)) -> (load (conv*)x) // If the resultant load doesn't need a higher alignment than the original! if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile() && // Do not remove the cast if the types differ in endian layout. TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) == TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) && TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) { LoadSDNode *LN0 = cast(N0); unsigned OrigAlign = LN0->getAlignment(); bool Fast = false; if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, LN0->getAddressSpace(), OrigAlign, &Fast) && Fast) { SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), LN0->getPointerInfo(), OrigAlign, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); return Load; } } if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI)) return V; // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) // // For ppc_fp128: // fold (bitcast (fneg x)) -> // flipbit = signbit // (xor (bitcast x) (build_pair flipbit, flipbit)) // // fold (bitcast (fabs x)) -> // flipbit = (and (extract_element (bitcast x), 0), signbit) // (xor (bitcast x) (build_pair flipbit, flipbit)) // This often reduces constant pool loads. if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && N0.getNode()->hasOneUse() && VT.isInteger() && !VT.isVector() && !N0.getValueType().isVector()) { SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0)); AddToWorklist(NewConv.getNode()); SDLoc DL(N); if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { assert(VT.getSizeInBits() == 128); SDValue SignBit = DAG.getConstant( APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); SDValue FlipBit; if (N0.getOpcode() == ISD::FNEG) { FlipBit = SignBit; AddToWorklist(FlipBit.getNode()); } else { assert(N0.getOpcode() == ISD::FABS); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv, DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), SDLoc(NewConv))); AddToWorklist(Hi.getNode()); FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit); AddToWorklist(FlipBit.getNode()); } SDValue FlipBits = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); AddToWorklist(FlipBits.getNode()); return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); } APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, DL, VT, NewConv, DAG.getConstant(SignBit, DL, VT)); assert(N0.getOpcode() == ISD::FABS); return DAG.getNode(ISD::AND, DL, VT, NewConv, DAG.getConstant(~SignBit, DL, VT)); } // fold (bitconvert (fcopysign cst, x)) -> // (or (and (bitconvert x), sign), (and cst, (not sign))) // Note that we don't handle (copysign x, cst) because this can always be // folded to an fneg or fabs. // // For ppc_fp128: // fold (bitcast (fcopysign cst, x)) -> // flipbit = (and (extract_element // (xor (bitcast cst), (bitcast x)), 0), // signbit) // (xor (bitcast cst) (build_pair flipbit, flipbit)) if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && isa(N0.getOperand(0)) && VT.isInteger() && !VT.isVector()) { unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits(); EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth); if (isTypeLegal(IntXVT)) { SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1)); AddToWorklist(X.getNode()); // If X has a different width than the result/lhs, sext it or truncate it. unsigned VTWidth = VT.getSizeInBits(); if (OrigXWidth < VTWidth) { X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X); AddToWorklist(X.getNode()); } else if (OrigXWidth > VTWidth) { // To get the sign bit in the right place, we have to shift it right // before truncating. SDLoc DL(X); X = DAG.getNode(ISD::SRL, DL, X.getValueType(), X, DAG.getConstant(OrigXWidth-VTWidth, DL, X.getValueType())); AddToWorklist(X.getNode()); X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); AddToWorklist(X.getNode()); } if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2); SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); AddToWorklist(Cst.getNode()); SDValue X = DAG.getBitcast(VT, N0.getOperand(1)); AddToWorklist(X.getNode()); SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X); AddToWorklist(XorResult.getNode()); SDValue XorResult64 = DAG.getNode( ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult, DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), SDLoc(XorResult))); AddToWorklist(XorResult64.getNode()); SDValue FlipBit = DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64, DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64)); AddToWorklist(FlipBit.getNode()); SDValue FlipBits = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); AddToWorklist(FlipBits.getNode()); return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); } APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, SDLoc(X), VT)); AddToWorklist(X.getNode()); SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT, Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT)); AddToWorklist(Cst.getNode()); return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst); } } // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive. if (N0.getOpcode() == ISD::BUILD_PAIR) if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT)) return CombineLD; // Remove double bitcasts from shuffles - this is often a legacy of // XformToShuffleWithZero being used to combine bitmaskings (of // float vectors bitcast to integer vectors) into shuffles. // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1) if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() && N0->getOpcode() == ISD::VECTOR_SHUFFLE && VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() && !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) { ShuffleVectorSDNode *SVN = cast(N0); // If operands are a bitcast, peek through if it casts the original VT. // If operands are a constant, just bitcast back to original VT. auto PeekThroughBitcast = [&](SDValue Op) { if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) return SDValue(Op.getOperand(0)); if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) return DAG.getBitcast(VT, Op); return SDValue(); }; // FIXME: If either input vector is bitcast, try to convert the shuffle to // the result type of this bitcast. This would eliminate at least one // bitcast. See the transform in InstCombine. SDValue SV0 = PeekThroughBitcast(N0->getOperand(0)); SDValue SV1 = PeekThroughBitcast(N0->getOperand(1)); if (!(SV0 && SV1)) return SDValue(); int MaskScale = VT.getVectorNumElements() / N0.getValueType().getVectorNumElements(); SmallVector NewMask; for (int M : SVN->getMask()) for (int i = 0; i != MaskScale; ++i) NewMask.push_back(M < 0 ? -1 : M * MaskScale + i); bool LegalMask = TLI.isShuffleMaskLegal(NewMask, VT); if (!LegalMask) { std::swap(SV0, SV1); ShuffleVectorSDNode::commuteMask(NewMask); LegalMask = TLI.isShuffleMaskLegal(NewMask, VT); } if (LegalMask) return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask); } return SDValue(); } SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) { EVT VT = N->getValueType(0); return CombineConsecutiveLoads(N, VT); } /// We know that BV is a build_vector node with Constant, ConstantFP or Undef /// operands. DstEltVT indicates the destination element value type. SDValue DAGCombiner:: ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { EVT SrcEltVT = BV->getValueType(0).getVectorElementType(); // If this is already the right type, we're done. if (SrcEltVT == DstEltVT) return SDValue(BV, 0); unsigned SrcBitSize = SrcEltVT.getSizeInBits(); unsigned DstBitSize = DstEltVT.getSizeInBits(); // If this is a conversion of N elements of one type to N elements of another // type, convert each element. This handles FP<->INT cases. if (SrcBitSize == DstBitSize) { EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, BV->getValueType(0).getVectorNumElements()); // Due to the FP element handling below calling this routine recursively, // we can end up with a scalar-to-vector node here. if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT, DAG.getBitcast(DstEltVT, BV->getOperand(0))); SmallVector Ops; for (SDValue Op : BV->op_values()) { // If the vector element type is not legal, the BUILD_VECTOR operands // are promoted and implicitly truncated. Make that explicit here. if (Op.getValueType() != SrcEltVT) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op); Ops.push_back(DAG.getBitcast(DstEltVT, Op)); AddToWorklist(Ops.back().getNode()); } return DAG.getBuildVector(VT, SDLoc(BV), Ops); } // Otherwise, we're growing or shrinking the elements. To avoid having to // handle annoying details of growing/shrinking FP values, we convert them to // int first. if (SrcEltVT.isFloatingPoint()) { // Convert the input float vector to a int vector where the elements are the // same sizes. EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits()); BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode(); SrcEltVT = IntVT; } // Now we know the input is an integer vector. If the output is a FP type, // convert to integer first, then to FP of the right size. if (DstEltVT.isFloatingPoint()) { EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits()); SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode(); // Next, convert to FP elements of the same size. return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT); } SDLoc DL(BV); // Okay, we know the src/dst types are both integers of differing types. // Handling growing first. assert(SrcEltVT.isInteger() && DstEltVT.isInteger()); if (SrcBitSize < DstBitSize) { unsigned NumInputsPerOutput = DstBitSize/SrcBitSize; SmallVector Ops; for (unsigned i = 0, e = BV->getNumOperands(); i != e; i += NumInputsPerOutput) { bool isLE = DAG.getDataLayout().isLittleEndian(); APInt NewBits = APInt(DstBitSize, 0); bool EltIsUndef = true; for (unsigned j = 0; j != NumInputsPerOutput; ++j) { // Shift the previously computed bits over. NewBits <<= SrcBitSize; SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j)); if (Op.isUndef()) continue; EltIsUndef = false; NewBits |= cast(Op)->getAPIntValue(). zextOrTrunc(SrcBitSize).zext(DstBitSize); } if (EltIsUndef) Ops.push_back(DAG.getUNDEF(DstEltVT)); else Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT)); } EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size()); return DAG.getBuildVector(VT, DL, Ops); } // Finally, this must be the case where we are shrinking elements: each input // turns into multiple outputs. unsigned NumOutputsPerInput = SrcBitSize/DstBitSize; EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, NumOutputsPerInput*BV->getNumOperands()); SmallVector Ops; for (const SDValue &Op : BV->op_values()) { if (Op.isUndef()) { Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT)); continue; } APInt OpVal = cast(Op)-> getAPIntValue().zextOrTrunc(SrcBitSize); for (unsigned j = 0; j != NumOutputsPerInput; ++j) { APInt ThisVal = OpVal.trunc(DstBitSize); Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT)); OpVal.lshrInPlace(DstBitSize); } // For big endian targets, swap the order of the pieces of each element. if (DAG.getDataLayout().isBigEndian()) std::reverse(Ops.end()-NumOutputsPerInput, Ops.end()); } return DAG.getBuildVector(VT, DL, Ops); } static bool isContractable(SDNode *N) { SDNodeFlags F = N->getFlags(); return F.hasAllowContract() || F.hasUnsafeAlgebra(); } /// Try to perform FMA combining on a given FADD node. SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || HasFMAD); // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { if (N.getOpcode() != ISD::FMUL) return false; return AllowFusionGlobally || isContractable(N.getNode()); }; // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { if (N0.getNode()->use_size() > N1.getNode()->use_size()) std::swap(N0, N1); } // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), N1); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), N0); } // Look through FP_EXTEND nodes to do more combining. // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), N1); } } // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) // Note: Commutes FADD operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0); } } // More folding opportunities when target permits. if (Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), N1)); } // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. if (Options.UnsafeFPMath && N1->getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FMUL && N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(2).getOperand(0), N1.getOperand(2).getOperand(1), N0)); } // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, U), DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z)); }; if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); if (isContractableFMUL(N020) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), N1); } } } // fold (fadd (fpext (fma x, y, (fmul u, v))), z) // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. auto FoldFAddFPExtFMAFMul = [&] ( SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, X), DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, U), DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z)); }; if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); if (isContractableFMUL(N002) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), N1); } } } // fold (fadd x, (fma y, z, (fpext (fmul u, v))) // -> (fma y, z, (fma (fpext u), (fpext v), x)) if (N1.getOpcode() == PreferredFusedOpcode) { SDValue N12 = N1.getOperand(2); if (N12.getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N12.getOperand(0); if (isContractableFMUL(N120) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), N0); } } } // fold (fadd x, (fpext (fma y, z, (fmul u, v))) // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x)) // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (N10.getOpcode() == PreferredFusedOpcode) { SDValue N102 = N10.getOperand(2); if (isContractableFMUL(N102) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), N0); } } } } return SDValue(); } /// Try to perform FMA combining on a given FSUB node. SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || HasFMAD); // If the subtraction is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { if (N.getOpcode() != ISD::FMUL) return false; return AllowFusionGlobally || isContractable(N.getNode()); }; // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, N1)); } // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), N0); // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N00), N01, DAG.getNode(ISD::FNEG, SL, VT, N1)); } // Look through FP_EXTEND nodes to do more combining. // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), DAG.getNode(ISD::FNEG, SL, VT, N1)); } } // fold (fsub x, (fpext (fmul y, z))) // -> (fma (fneg (fpext y)), (fpext z), x) // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))), DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0); } } // fold (fsub (fpext (fneg (fmul, x, y))), z) // -> (fneg (fma (fpext x), (fpext y), z)) // Note: This could be removed with appropriate canonicalization of the // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent // from implementing the canonicalization in visitFSUB. if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FNEG) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), N1)); } } } // fold (fsub (fneg (fpext (fmul, x, y))), z) // -> (fneg (fma (fpext x)), (fpext y), z) // Note: This could be removed with appropriate canonicalization of the // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent // from implementing the canonicalization in visitFSUB. if (N0.getOpcode() == ISD::FNEG) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FP_EXTEND) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), N1)); } } } // More folding opportunities when target permits. if (Aggressive) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, N1))); } // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF // are currently only supported on binary nodes. if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N1.getOperand(2))) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N20), N21, N0)); } // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); if (isContractableFMUL(N020) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)), DAG.getNode(ISD::FNEG, SL, VT, N1))); } } } // fold (fsub (fpext (fma x, y, (fmul u, v))), z) // -> (fma (fpext x), (fpext y), // (fma (fpext u), (fpext v), (fneg z))) // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); if (isContractableFMUL(N002) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)), DAG.getNode(ISD::FNEG, SL, VT, N1))); } } } // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) if (N1.getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N1.getOperand(2).getOperand(0); if (isContractableFMUL(N120) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N1200)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0)); } } // fold (fsub x, (fpext (fma y, z, (fmul u, v)))) // -> (fma (fneg (fpext y)), (fpext z), // (fma (fneg (fpext u)), (fpext v), x)) // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { SDValue CvtSrc = N1.getOperand(0); SDValue N100 = CvtSrc.getOperand(0); SDValue N101 = CvtSrc.getOperand(1); SDValue N102 = CvtSrc.getOperand(2); if (isContractableFMUL(N102) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N100)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N1020)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0)); } } } return SDValue(); } /// Try to perform FMA combining on a given FMUL node based on the distributive /// law x * (y + 1) = x * y + x and variants thereof (commuted versions, /// subtraction instead of addition). SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); const TargetOptions &Options = DAG.getTarget().Options; // The transforms below are incorrect when x == 0 and y == inf, because the // intermediate multiplication produces a nan. if (!Options.NoInfsFPMath) return SDValue(); // Floating-point multiply-add without intermediate rounding. bool HasFMA = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // Floating-point multiply-add with intermediate rounding. This can result // in a less precise result due to the changed rounding order. bool HasFMAD = Options.UnsafeFPMath && (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y) // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y)) auto FuseFADD = [&](SDValue X, SDValue Y) { if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); if (XC1 && XC1->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); if (XC1 && XC1->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, DAG.getNode(ISD::FNEG, SL, VT, Y)); } return SDValue(); }; if (SDValue FMA = FuseFADD(N0, N1)) return FMA; if (SDValue FMA = FuseFADD(N1, N0)) return FMA; // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y) // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y)) // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y)) // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y) auto FuseFSUB = [&](SDValue X, SDValue Y) { if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { auto XC0 = isConstOrConstSplatFP(X.getOperand(0)); if (XC0 && XC0->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, Y); if (XC0 && XC0->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, DAG.getNode(ISD::FNEG, SL, VT, Y)); auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); if (XC1 && XC1->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, DAG.getNode(ISD::FNEG, SL, VT, Y)); if (XC1 && XC1->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); } return SDValue(); }; if (SDValue FMA = FuseFSUB(N0, N1)) return FMA; if (SDValue FMA = FuseFSUB(N1, N0)) return FMA; return SDValue(); } static bool isFMulNegTwo(SDValue &N) { if (N.getOpcode() != ISD::FMUL) return false; if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1))) return CFP->isExactlyValue(-2.0); return false; } SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0); bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags); // canonicalize constant to RHS if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, DL, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations), Flags); // fold (fadd (fneg A), B) -> (fsub B, A) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, DL, VT, N1, GetNegatedExpression(N0, DAG, LegalOperations), Flags); // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B)) // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B)) if ((isFMulNegTwo(N0) && N0.hasOneUse()) || (isFMulNegTwo(N1) && N1.hasOneUse())) { bool N1IsFMul = isFMulNegTwo(N1); SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0); SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags); return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags); } // FIXME: Auto-upgrade the target/function-level option. if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) { // fold (fadd A, 0) -> A if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1)) if (N1C->isZero()) return N0; } // If 'unsafe math' is enabled, fold lots of things. if (Options.UnsafeFPMath) { // No FP constant should be created after legalization as Instruction // Selection pass has a hard time dealing with FP constants. bool AllowNewConst = (Level < AfterLegalizeDAG); // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2)) if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() && isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, Flags), Flags); // If allowed, fold (fadd (fneg x), x) -> 0.0 if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) return DAG.getConstantFP(0.0, DL, VT); // If allowed, fold (fadd x, (fneg x)) -> 0.0 if (AllowNewConst && N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) return DAG.getConstantFP(0.0, DL, VT); // We can fold chains of FADD's of the same value into multiplications. // This transform is not safe in general because we are reducing the number // of rounding steps. if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { if (N0.getOpcode() == ISD::FMUL) { bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); // (fadd (fmul x, c), x) -> (fmul x, c+1) if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), DAG.getConstantFP(1.0, DL, VT), Flags); return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags); } // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), DAG.getConstantFP(2.0, DL, VT), Flags); return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags); } } if (N1.getOpcode() == ISD::FMUL) { bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); // (fadd x, (fmul x, c)) -> (fmul x, c+1) if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), DAG.getConstantFP(1.0, DL, VT), Flags); return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags); } // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N0.getOperand(0)) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), DAG.getConstantFP(2.0, DL, VT), Flags); return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags); } } if (N0.getOpcode() == ISD::FADD && AllowNewConst) { bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); // (fadd (fadd x, x), x) -> (fmul x, 3.0) if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) && (N0.getOperand(0) == N1)) { return DAG.getNode(ISD::FMUL, DL, VT, N1, DAG.getConstantFP(3.0, DL, VT), Flags); } } if (N1.getOpcode() == ISD::FADD && AllowNewConst) { bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); // (fadd x, (fadd x, x)) -> (fmul x, 3.0) if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && N1.getOperand(0) == N0) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getConstantFP(3.0, DL, VT), Flags); } } // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0) if (AllowNewConst && N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), DAG.getConstantFP(4.0, DL, VT), Flags); } } } // enable-unsafe-fp-math // FADD -> FMA combines: if (SDValue Fused = visitFADDForFMACombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } return SDValue(); } SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (fsub c1, c2) -> c1-c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, DL, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations), Flags); // FIXME: Auto-upgrade the target/function-level option. if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) { // (fsub 0, B) -> -B if (N0CFP && N0CFP->isZero()) { if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return GetNegatedExpression(N1, DAG, LegalOperations); if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags); } } // If 'unsafe math' is enabled, fold lots of things. if (Options.UnsafeFPMath) { // (fsub A, 0) -> A if (N1CFP && N1CFP->isZero()) return N0; // (fsub x, x) -> 0.0 if (N0 == N1) return DAG.getConstantFP(0.0f, DL, VT); // (fsub x, (fadd x, y)) -> (fneg y) // (fsub x, (fadd y, x)) -> (fneg y) if (N1.getOpcode() == ISD::FADD) { SDValue N10 = N1->getOperand(0); SDValue N11 = N1->getOperand(1); if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, &Options)) return GetNegatedExpression(N11, DAG, LegalOperations); if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, &Options)) return GetNegatedExpression(N10, DAG, LegalOperations); } } // FSUB -> FMA combines: if (SDValue Fused = visitFSUBForFMACombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } return SDValue(); } SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) { // This just handles C1 * C2 for vectors. Other vector folds are below. if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; } // fold (fmul c1, c2) -> c1*c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags); // canonicalize constant to RHS if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags); // fold (fmul A, 1.0) -> A if (N1CFP && N1CFP->isExactlyValue(1.0)) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; if (Options.UnsafeFPMath) { // fold (fmul A, 0) -> 0 if (N1CFP && N1CFP->isZero()) return N1; // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2)) if (N0.getOpcode() == ISD::FMUL) { // Fold scalars or any vector constants (not just splats). // This fold is done in general by InstCombine, but extra fmul insts // may have been generated during lowering. SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); auto *BV1 = dyn_cast(N1); auto *BV00 = dyn_cast(N00); auto *BV01 = dyn_cast(N01); // Check 1: Make sure that the first operand of the inner multiply is NOT // a constant. Otherwise, we may induce infinite looping. if (!(isConstOrConstSplatFP(N00) || (BV00 && BV00->isConstant()))) { // Check 2: Make sure that the second operand of the inner multiply and // the second operand of the outer multiply are constants. if ((N1CFP && isConstOrConstSplatFP(N01)) || (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) { SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags); return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags); } } } // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs // during an early run of DAGCombiner can prevent folding with fmuls // inserted during lowering. if (N0.getOpcode() == ISD::FADD && (N0.getOperand(0) == N0.getOperand(1)) && N0.hasOneUse()) { const SDValue Two = DAG.getConstantFP(2.0, DL, VT); SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags); return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags); } } // fold (fmul X, 2.0) -> (fadd X, X) if (N1CFP && N1CFP->isExactlyValue(+2.0)) return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags); // fold (fmul X, -1.0) -> (fneg X) if (N1CFP && N1CFP->isExactlyValue(-1.0)) if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, N0); // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y) if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) { if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FMUL, DL, VT, GetNegatedExpression(N0, DAG, LegalOperations), GetNegatedExpression(N1, DAG, LegalOperations), Flags); } } // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X) if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() && (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) && TLI.isOperationLegal(ISD::FABS, VT)) { SDValue Select = N0, X = N1; if (Select.getOpcode() != ISD::SELECT) std::swap(Select, X); SDValue Cond = Select.getOperand(0); auto TrueOpnd = dyn_cast(Select.getOperand(1)); auto FalseOpnd = dyn_cast(Select.getOperand(2)); if (TrueOpnd && FalseOpnd && Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X && isa(Cond.getOperand(1)) && cast(Cond.getOperand(1))->isExactlyValue(0.0)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); switch (CC) { default: break; case ISD::SETOLT: case ISD::SETULT: case ISD::SETOLE: case ISD::SETULE: case ISD::SETLT: case ISD::SETLE: std::swap(TrueOpnd, FalseOpnd); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETUGT: case ISD::SETOGE: case ISD::SETUGE: case ISD::SETGT: case ISD::SETGE: if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) && TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, DAG.getNode(ISD::FABS, DL, VT, X)); if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0)) return DAG.getNode(ISD::FABS, DL, VT, X); break; } } } // FMUL -> FMA combines: if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } return SDValue(); } SDValue DAGCombiner::visitFMA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; // Constant fold FMA. if (isa(N0) && isa(N1) && isa(N2)) { return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); } if (Options.UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; if (N1CFP && N1CFP->isZero()) return N2; } // TODO: The FMA node should have flags that propagate to these nodes. if (N0CFP && N0CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); if (N1CFP && N1CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); // Canonicalize (fma c, x, y) -> (fma x, c, y) if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); // TODO: FMA nodes should have flags that propagate to the created nodes. // For now, create a Flags object for use with all unsafe math transforms. SDNodeFlags Flags; Flags.setUnsafeAlgebra(true); if (Options.UnsafeFPMath) { // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && isConstantFPBuildVectorOrConstantFP(N1) && isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1), Flags), Flags); } // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) if (N0.getOpcode() == ISD::FMUL && isConstantFPBuildVectorOrConstantFP(N1) && isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1), Flags), N2); } } // (fma x, 1, y) -> (fadd x, y) // (fma x, -1, y) -> (fadd (fneg x), y) if (N1CFP) { if (N1CFP->isExactlyValue(1.0)) // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, DL, VT, N0, N2); if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0); AddToWorklist(RHSNeg.getNode()); // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg); } // fma (fneg x), K, y -> fma x -K, y if (N0.getOpcode() == ISD::FNEG && (TLI.isOperationLegal(ISD::ConstantFP, VT) || (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) { return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2); } } if (Options.UnsafeFPMath) { // (fma x, c, x) -> (fmul x, (c+1)) if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(1.0, DL, VT), Flags), Flags); } // (fma x, c, (fneg x)) -> (fmul x, (c-1)) if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(-1.0, DL, VT), Flags), Flags); } } return SDValue(); } // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal. // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) // Notice that this is not always beneficial. One reason is different targets // may have different costs for FDIV and FMUL, so sometimes the cost of two // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath; const SDNodeFlags Flags = N->getFlags(); if (!UnsafeMath && !Flags.hasAllowReciprocal()) return SDValue(); // Skip if current node is a reciprocal. SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); if (N0CFP && N0CFP->isExactlyValue(1.0)) return SDValue(); // Exit early if the target does not want this transform or if there can't // possibly be enough uses of the divisor to make the transform worthwhile. SDValue N1 = N->getOperand(1); unsigned MinUses = TLI.combineRepeatedFPDivisors(); if (!MinUses || N1->use_size() < MinUses) return SDValue(); // Find all FDIV users of the same divisor. // Use a set because duplicates may be present in the user list. SetVector Users; for (auto *U : N1->uses()) { if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { // This division is eligible for optimization only if global unsafe math // is enabled or if this division allows reciprocal formation. if (UnsafeMath || U->getFlags().hasAllowReciprocal()) Users.insert(U); } } // Now that we have the actual number of divisor uses, make sure it meets // the minimum threshold specified by the target. if (Users.size() < MinUses) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags); // Dividend / Divisor -> Dividend * Reciprocal for (auto *U : Users) { SDValue Dividend = U->getOperand(0); if (Dividend != FPOne) { SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend, Reciprocal, Flags); CombineTo(U, NewNode); } else if (U != Reciprocal.getNode()) { // In the absence of fast-math-flags, this user node is always the // same node as Reciprocal, but with FMF they may be different nodes. CombineTo(U, Reciprocal); } } return SDValue(N, 0); // N was replaced. } SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (fdiv c1, c2) -> c1/c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; if (Options.UnsafeFPMath) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. if (N1CFP) { // Compute the reciprocal 1.0 / c2. const APFloat &N1APF = N1CFP->getValueAPF(); APFloat Recip(N1APF.getSemantics(), 1); // 1.0 APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven); // Only do the transform if the reciprocal is a legal fp immediate that // isn't too nasty (eg NaN, denormal, ...). if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty (!LegalOperations || // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM // backend)... we should handle this gracefully after Legalize. // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) || TLI.isOperationLegal(ISD::ConstantFP, VT) || TLI.isFPImmLegal(Recip, VT))) return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getConstantFP(Recip, DL, VT), Flags); } // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) { return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) { RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FP_ROUND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) { RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FMUL) { // Look through an FMUL. Even though this won't remove the FDIV directly, // it's still worthwhile to get rid of the FSQRT if possible. SDValue SqrtOp; SDValue OtherOp; if (N1.getOperand(0).getOpcode() == ISD::FSQRT) { SqrtOp = N1.getOperand(0); OtherOp = N1.getOperand(1); } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) { SqrtOp = N1.getOperand(1); OtherOp = N1.getOperand(0); } if (SqrtOp.getNode()) { // We found a FSQRT, so try to make this fold: // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) { RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } } // Fold into a reciprocal estimate and multiply instead of a real divide. if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) { AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) { if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, GetNegatedExpression(N0, DAG, LegalOperations), GetNegatedExpression(N1, DAG, LegalOperations), Flags); } } if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N)) return CombineRepeatedDivisors; return SDValue(); } SDValue DAGCombiner::visitFREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); // fold (frem c1, c2) -> fmod(c1,c2) if (N0CFP && N1CFP) return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags()); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; return SDValue(); } SDValue DAGCombiner::visitFSQRT(SDNode *N) { if (!DAG.getTarget().Options.UnsafeFPMath) return SDValue(); SDValue N0 = N->getOperand(0); if (TLI.isFsqrtCheap(N0, DAG)) return SDValue(); // TODO: FSQRT nodes should have flags that propagate to the created nodes. // For now, create a Flags object for use with all unsafe math transforms. SDNodeFlags Flags; Flags.setUnsafeAlgebra(true); return buildSqrtEstimate(N0, Flags); } /// copysign(x, fp_extend(y)) -> copysign(x, y) /// copysign(x, fp_round(y)) -> copysign(x, y) static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { SDValue N1 = N->getOperand(1); if ((N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND)) { // Do not optimize out type conversion of f128 type yet. // For some targets like x86_64, configuration is changed to keep one f128 // value in one SSE register, but instruction selection cannot handle // FCOPYSIGN on SSE registers yet. EVT N1VT = N1->getValueType(0); EVT N1Op0VT = N1->getOperand(0).getValueType(); return (N1VT == N1Op0VT || N1Op0VT != MVT::f128); } return false; } SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); if (N0CFP && N1CFP) // Constant fold return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1); if (N1CFP) { const APFloat &V = N1CFP->getValueAPF(); // copysign(x, c1) -> fabs(x) iff ispos(c1) // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1) if (!V.isNegative()) { if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT)) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); } else { if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0)); } } // copysign(fabs(x), y) -> copysign(x, y) // copysign(fneg(x), y) -> copysign(x, y) // copysign(copysign(x,z), y) -> copysign(x, y) if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1); // copysign(x, abs(y)) -> abs(x) if (N1.getOpcode() == ISD::FABS) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); // copysign(x, copysign(y,z)) -> copysign(x, z) if (N1.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1)); // copysign(x, fp_extend(y)) -> copysign(x, y) // copysign(x, fp_round(y)) -> copysign(x, y) if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); // fold (sint_to_fp c1) -> c1fp if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && // ...but only if the target supports immediate floating-point values (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); // If the input is a legal type, and SINT_TO_FP is not legal on this target, // but UINT_TO_FP is legal on this target, try to convert. if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT) && TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to UINT_TO_FP. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); } // The next optimizations are desirable only if SELECT_CC can be lowered. if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && !VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), N0.getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); } // fold (sint_to_fp (zext (setcc x, y, cc))) -> // (select_cc x, y, 1.0, 0.0,, cc) if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1), DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), N0.getOperand(0).getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); } } return SDValue(); } SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); // fold (uint_to_fp c1) -> c1fp if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && // ...but only if the target supports immediate floating-point values (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); // If the input is a legal type, and UINT_TO_FP is not legal on this target, // but SINT_TO_FP is legal on this target, try to convert. if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT) && TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to SINT_TO_FP. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); } // The next optimizations are desirable only if SELECT_CC can be lowered. if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), N0.getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); } } return SDValue(); } // Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP) return SDValue(); SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP; bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT; // We can safely assume the conversion won't overflow the output range, // because (for example) (uint8_t)18293.f is undefined behavior. // Since we can assume the conversion won't overflow, our decision as to // whether the input will fit in the float should depend on the minimum // of the input range and output range. // This means this is also safe for a signed input and unsigned output, since // a negative input would lead to undefined behavior. unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned; unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned; unsigned ActualSize = std::min(InputSize, OutputSize); const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType()); // We can only fold away the float conversion if the input range can be // represented exactly in the float range. if (APFloat::semanticsPrecision(sem) >= ActualSize) { if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) { unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOp, SDLoc(N), VT, Src); } if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src); return DAG.getBitcast(VT, Src); } return SDValue(); } SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (fp_to_sint c1fp) -> c1 if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); return FoldIntToFPToInt(N, DAG); } SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (fp_to_uint c1fp) -> c1 if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); return FoldIntToFPToInt(N, DAG); } SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // fold (fp_round c1fp) -> c1fp if (N0CFP) return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1); // fold (fp_round (fp_extend x)) -> x if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType()) return N0.getOperand(0); // fold (fp_round (fp_round x)) -> (fp_round x) if (N0.getOpcode() == ISD::FP_ROUND) { const bool NIsTrunc = N->getConstantOperandVal(1) == 1; const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1; // Skip this folding if it results in an fp_round from f80 to f16. // // f80 to f16 always generates an expensive (and as yet, unimplemented) // libcall to __truncxfhf2 instead of selecting native f16 conversion // instructions from f32 or f64. Moreover, the first (value-preserving) // fp_round from f80 to either f32 or f64 may become a NOP in platforms like // x86. if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16) return SDValue(); // If the first fp_round isn't a value preserving truncation, it might // introduce a tie in the second fp_round, that wouldn't occur in the // single-step fp_round we want to fold to. // In other words, double rounding isn't the same as rounding. // Also, this is a value preserving truncation iff both fp_round's are. if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) { SDLoc DL(N); return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0), DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL)); } } // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) { SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(Tmp.getNode()); return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, Tmp, N0.getOperand(1)); } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT EVT = cast(N->getOperand(1))->getVT(); ConstantFPSDNode *N0CFP = dyn_cast(N0); // fold (fp_round_inreg c1fp) -> c1fp if (N0CFP && isTypeLegal(EVT)) { SDLoc DL(N); SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), DL, EVT); return DAG.getNode(ISD::FP_EXTEND, DL, VT, Round); } return SDValue(); } SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND) return SDValue(); // fold (fp_extend c1fp) -> c1fp if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0); // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op) if (N0.getOpcode() == ISD::FP16_TO_FP && TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal) return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0)); // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the // value of X. if (N0.getOpcode() == ISD::FP_ROUND && N0.getConstantOperandVal(1) == 1) { SDValue In = N0.getOperand(0); if (In.getValueType() == VT) return In; if (VT.bitsLT(In.getValueType())) return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, In, N0.getOperand(1)); return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In); } // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad, DAG.getIntPtrConstant(1, SDLoc(N0))), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } SDValue DAGCombiner::visitFCEIL(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (fceil c1) -> fceil(c1) if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitFTRUNC(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ftrunc c1) -> ftrunc(c1) if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); // fold ftrunc (known rounded int x) -> x // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is // likely to be generated to extract integer from a rounded floating value. switch (N0.getOpcode()) { default: break; case ISD::FRINT: case ISD::FTRUNC: case ISD::FNEARBYINT: case ISD::FFLOOR: case ISD::FCEIL: return N0; } return SDValue(); } SDValue DAGCombiner::visitFFLOOR(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ffloor c1) -> ffloor(c1) if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); return SDValue(); } // FIXME: FNEG and FABS have a lot in common; refactor. SDValue DAGCombiner::visitFNEG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // Constant fold FNEG. if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(), &DAG.getTarget().Options)) return GetNegatedExpression(N0, DAG, LegalOperations); // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading // constant pool values. if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { APInt SignMask; if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x80... per scalar element // and splat it. SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x80... SignMask = APInt::getSignMask(IntVT.getSizeInBits()); } SDLoc DL0(N0); Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int, DAG.getConstant(SignMask, DL0, IntVT)); AddToWorklist(Int.getNode()); return DAG.getBitcast(VT, Int); } } // (fneg (fmul c, x)) -> (fmul -c, x) if (N0.getOpcode() == ISD::FMUL && (N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) { ConstantFPSDNode *CFP1 = dyn_cast(N0.getOperand(1)); if (CFP1) { APFloat CVal = CFP1->getValueAPF(); CVal.changeSign(); if (Level >= AfterLegalizeDAG && (TLI.isFPImmLegal(CVal, VT) || TLI.isOperationLegal(ISD::ConstantFP, VT))) return DAG.getNode( ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)), N0->getFlags()); } } return SDValue(); } SDValue DAGCombiner::visitFMINNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), VT); } // Canonicalize to constant on RHS. if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0); return SDValue(); } SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), VT); } // Canonicalize to constant on RHS. if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0); return SDValue(); } SDValue DAGCombiner::visitFABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (fabs c1) -> fabs(c1) if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); // fold (fabs (fabs x)) -> (fabs x) if (N0.getOpcode() == ISD::FABS) return N->getOperand(0); // fold (fabs (fneg x)) -> (fabs x) // fold (fabs (fcopysign x, y)) -> (fabs x) if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0)); // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading // constant pool values. if (!TLI.isFAbsFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { APInt SignMask; if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x7f... per scalar element // and splat it. SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x7f... SignMask = ~APInt::getSignMask(IntVT.getSizeInBits()); } SDLoc DL(N0); Int = DAG.getNode(ISD::AND, DL, IntVT, Int, DAG.getConstant(SignMask, DL, IntVT)); AddToWorklist(Int.getNode()); return DAG.getBitcast(N->getValueType(0), Int); } } return SDValue(); } SDValue DAGCombiner::visitBRCOND(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); // If N is a constant we could fold this into a fallthrough or unconditional // branch. However that doesn't happen very often in normal code, because // Instcombine/SimplifyCFG should have handled the available opportunities. // If we did this folding here, it would be necessary to update the // MachineBasicBlock CFG, which is awkward. // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal // on the target. if (N1.getOpcode() == ISD::SETCC && TLI.isOperationLegalOrCustom(ISD::BR_CC, N1.getOperand(0).getValueType())) { return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, Chain, N1.getOperand(2), N1.getOperand(0), N1.getOperand(1), N2); } if ((N1.hasOneUse() && N1.getOpcode() == ISD::SRL) || ((N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) && (N1.getOperand(0).hasOneUse() && N1.getOperand(0).getOpcode() == ISD::SRL))) { SDNode *Trunc = nullptr; if (N1.getOpcode() == ISD::TRUNCATE) { // Look pass the truncate. Trunc = N1.getNode(); N1 = N1.getOperand(0); } // Match this pattern so that we can generate simpler code: // // %a = ... // %b = and i32 %a, 2 // %c = srl i32 %b, 1 // brcond i32 %c ... // // into // // %a = ... // %b = and i32 %a, 2 // %c = setcc eq %b, 0 // brcond %c ... // // This applies only when the AND constant value has one bit set and the // SRL constant is equal to the log2 of the AND constant. The back-end is // smart enough to convert the result into a TEST/JMP sequence. SDValue Op0 = N1.getOperand(0); SDValue Op1 = N1.getOperand(1); if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) { SDValue AndOp1 = Op0.getOperand(1); if (AndOp1.getOpcode() == ISD::Constant) { const APInt &AndConst = cast(AndOp1)->getAPIntValue(); if (AndConst.isPowerOf2() && cast(Op1)->getAPIntValue()==AndConst.logBase2()) { SDLoc DL(N); SDValue SetCC = DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()), Op0, DAG.getConstant(0, DL, Op0.getValueType()), ISD::SETNE); SDValue NewBRCond = DAG.getNode(ISD::BRCOND, DL, MVT::Other, Chain, SetCC, N2); // Don't add the new BRCond into the worklist or else SimplifySelectCC // will convert it back to (X & C1) >> C2. CombineTo(N, NewBRCond, false); // Truncate is dead. if (Trunc) deleteAndRecombine(Trunc); // Replace the uses of SRL with SETCC WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, SetCC); deleteAndRecombine(N1.getNode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } if (Trunc) // Restore N1 if the above transformation doesn't match. N1 = N->getOperand(1); } // Transform br(xor(x, y)) -> br(x != y) // Transform br(xor(xor(x,y), 1)) -> br (x == y) if (N1.hasOneUse() && N1.getOpcode() == ISD::XOR) { SDNode *TheXor = N1.getNode(); SDValue Op0 = TheXor->getOperand(0); SDValue Op1 = TheXor->getOperand(1); if (Op0.getOpcode() == Op1.getOpcode()) { // Avoid missing important xor optimizations. if (SDValue Tmp = visitXOR(TheXor)) { if (Tmp.getNode() != TheXor) { DEBUG(dbgs() << "\nReplacing.8 "; TheXor->dump(&DAG); dbgs() << "\nWith: "; Tmp.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, Tmp); deleteAndRecombine(TheXor); return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, Tmp, N2); } // visitXOR has changed XOR's operands or replaced the XOR completely, // bail out. return SDValue(N, 0); } } if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) { bool Equal = false; if (isOneConstant(Op0) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR) { TheXor = Op0.getNode(); Equal = true; } EVT SetCCVT = N1.getValueType(); if (LegalTypes) SetCCVT = getSetCCResultType(SetCCVT); SDValue SetCC = DAG.getSetCC(SDLoc(TheXor), SetCCVT, Op0, Op1, Equal ? ISD::SETEQ : ISD::SETNE); // Replace the uses of XOR with SETCC WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N1, SetCC); deleteAndRecombine(N1.getNode()); return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, SetCC, N2); } } return SDValue(); } // Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB. // SDValue DAGCombiner::visitBR_CC(SDNode *N) { CondCodeSDNode *CC = cast(N->getOperand(1)); SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3); // If N is a constant we could fold this into a fallthrough or unconditional // branch. However that doesn't happen very often in normal code, because // Instcombine/SimplifyCFG should have handled the available opportunities. // If we did this folding here, it would be necessary to update the // MachineBasicBlock CFG, which is awkward. // Use SimplifySetCC to simplify SETCC's. SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()), CondLHS, CondRHS, CC->get(), SDLoc(N), false); if (Simp.getNode()) AddToWorklist(Simp.getNode()); // fold to a simpler setcc if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC) return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, N->getOperand(0), Simp.getOperand(2), Simp.getOperand(0), Simp.getOperand(1), N->getOperand(4)); return SDValue(); } /// Return true if 'Use' is a load or a store that uses N as its base pointer /// and that N may be folded in the load / store addressing mode. static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, const TargetLowering &TLI) { EVT VT; unsigned AS; if (LoadSDNode *LD = dyn_cast(Use)) { if (LD->isIndexed() || LD->getBasePtr().getNode() != N) return false; VT = LD->getMemoryVT(); AS = LD->getAddressSpace(); } else if (StoreSDNode *ST = dyn_cast(Use)) { if (ST->isIndexed() || ST->getBasePtr().getNode() != N) return false; VT = ST->getMemoryVT(); AS = ST->getAddressSpace(); } else return false; TargetLowering::AddrMode AM; if (N->getOpcode() == ISD::ADD) { ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); if (Offset) // [reg +/- imm] AM.BaseOffs = Offset->getSExtValue(); else // [reg +/- reg] AM.Scale = 1; } else if (N->getOpcode() == ISD::SUB) { ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); if (Offset) // [reg +/- imm] AM.BaseOffs = -Offset->getSExtValue(); else // [reg +/- reg] AM.Scale = 1; } else return false; return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, VT.getTypeForEVT(*DAG.getContext()), AS); } /// Try turning a load/store into a pre-indexed load/store when the base /// pointer is an add or subtract and it has other uses besides the load/store. /// After the transformation, the new indexed load/store has effectively folded /// the add/subtract in and all of its other uses are redirected to the /// new load/store. bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; bool isLoad = true; SDValue Ptr; EVT VT; if (LoadSDNode *LD = dyn_cast(N)) { if (LD->isIndexed()) return false; VT = LD->getMemoryVT(); if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) && !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT)) return false; Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { if (ST->isIndexed()) return false; VT = ST->getMemoryVT(); if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) && !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT)) return false; Ptr = ST->getBasePtr(); isLoad = false; } else { return false; } // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail // out. There is no reason to make this a preinc/predec. if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) || Ptr.getNode()->hasOneUse()) return false; // Ask the target to do addressing mode selection. SDValue BasePtr; SDValue Offset; ISD::MemIndexedMode AM = ISD::UNINDEXED; if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG)) return false; // Backends without true r+i pre-indexed forms may need to pass a // constant base with a variable offset so that constant coercion // will work with the patterns in canonical form. bool Swapped = false; if (isa(BasePtr)) { std::swap(BasePtr, Offset); Swapped = true; } // Don't create a indexed load / store with zero offset. if (isNullConstant(Offset)) return false; // Try turning it into a pre-indexed load / store except when: // 1) The new base ptr is a frame index. // 2) If N is a store and the new base ptr is either the same as or is a // predecessor of the value being stored. // 3) Another use of old base ptr is a predecessor of N. If ptr is folded // that would create a cycle. // 4) All uses are load / store ops that use it as old base ptr. // Check #1. Preinc'ing a frame index would require copying the stack pointer // (plus the implicit offset) to a register to preinc anyway. if (isa(BasePtr) || isa(BasePtr)) return false; // Check #2. if (!isLoad) { SDValue Val = cast(N)->getValue(); if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode())) return false; } // Caches for hasPredecessorHelper. SmallPtrSet Visited; SmallVector Worklist; Worklist.push_back(N); // If the offset is a constant, there may be other adds of constants that // can be folded with this one. We should do this to avoid having to keep // a copy of the original base pointer. SmallVector OtherUses; if (isa(Offset)) for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(), UE = BasePtr.getNode()->use_end(); UI != UE; ++UI) { SDUse &Use = UI.getUse(); // Skip the use that is Ptr and uses of other results from BasePtr's // node (important for nodes that return multiple results). if (Use.getUser() == Ptr.getNode() || Use != BasePtr) continue; if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist)) continue; if (Use.getUser()->getOpcode() != ISD::ADD && Use.getUser()->getOpcode() != ISD::SUB) { OtherUses.clear(); break; } SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1); if (!isa(Op1)) { OtherUses.clear(); break; } // FIXME: In some cases, we can be smarter about this. if (Op1.getValueType() != Offset.getValueType()) { OtherUses.clear(); break; } OtherUses.push_back(Use.getUser()); } if (Swapped) std::swap(BasePtr, Offset); // Now check for #3 and #4. bool RealUse = false; for (SDNode *Use : Ptr.getNode()->uses()) { if (Use == N) continue; if (SDNode::hasPredecessorHelper(Use, Visited, Worklist)) return false; // If Ptr may be folded in addressing mode of other use, then it's // not profitable to do this transformation. if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI)) RealUse = true; } if (!RealUse) return false; SDValue Result; if (isLoad) Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); else Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); ++PreIndexedNodes; ++NodesCombined; DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (isLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); } // Finally, since the node is now dead, remove it from the graph. deleteAndRecombine(N); if (Swapped) std::swap(BasePtr, Offset); // Replace other uses of BasePtr that can be updated to use Ptr for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) { unsigned OffsetIdx = 1; if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode()) OffsetIdx = 0; assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() == BasePtr.getNode() && "Expected BasePtr operand"); // We need to replace ptr0 in the following expression: // x0 * offset0 + y0 * ptr0 = t0 // knowing that // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store) // // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the // indexed load/store and the expression that needs to be re-written. // // Therefore, we have: // t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1 ConstantSDNode *CN = cast(OtherUses[i]->getOperand(OffsetIdx)); int X0, X1, Y0, Y1; const APInt &Offset0 = CN->getAPIntValue(); APInt Offset1 = cast(Offset)->getAPIntValue(); X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1; Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1; X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1; Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1; unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD; APInt CNV = Offset0; if (X0 < 0) CNV = -CNV; if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1; else CNV = CNV - Offset1; SDLoc DL(OtherUses[i]); // We can now generate the new expression. SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0)); SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0); SDValue NewUse = DAG.getNode(Opcode, DL, OtherUses[i]->getValueType(0), NewOp1, NewOp2); DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse); deleteAndRecombine(OtherUses[i]); } // Replace the uses of Ptr with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); deleteAndRecombine(Ptr.getNode()); AddToWorklist(Result.getNode()); return true; } /// Try to combine a load/store with a add/sub of the base pointer node into a /// post-indexed load/store. The transformation folded the add/subtract into the /// new indexed load/store effectively and all of its uses are redirected to the /// new load/store. bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; bool isLoad = true; SDValue Ptr; EVT VT; if (LoadSDNode *LD = dyn_cast(N)) { if (LD->isIndexed()) return false; VT = LD->getMemoryVT(); if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) && !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT)) return false; Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { if (ST->isIndexed()) return false; VT = ST->getMemoryVT(); if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) && !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT)) return false; Ptr = ST->getBasePtr(); isLoad = false; } else { return false; } if (Ptr.getNode()->hasOneUse()) return false; for (SDNode *Op : Ptr.getNode()->uses()) { if (Op == N || (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)) continue; SDValue BasePtr; SDValue Offset; ISD::MemIndexedMode AM = ISD::UNINDEXED; if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) { // Don't create a indexed load / store with zero offset. if (isNullConstant(Offset)) continue; // Try turning it into a post-indexed load / store except when // 1) All uses are load / store ops that use it as base ptr (and // it may be folded as addressing mmode). // 2) Op must be independent of N, i.e. Op is neither a predecessor // nor a successor of N. Otherwise, if Op is folded that would // create a cycle. if (isa(BasePtr) || isa(BasePtr)) continue; // Check for #1. bool TryNext = false; for (SDNode *Use : BasePtr.getNode()->uses()) { if (Use == Ptr.getNode()) continue; // If all the uses are load / store addresses, then don't do the // transformation. if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){ bool RealUse = false; for (SDNode *UseUse : Use->uses()) { if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) RealUse = true; } if (!RealUse) { TryNext = true; break; } } } if (TryNext) continue; // Check for #2 if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) { SDValue Result = isLoad ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM) : DAG.getIndexedStore(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); ++PostIndexedNodes; ++NodesCombined; DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (isLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); } // Finally, since the node is now dead, remove it from the graph. deleteAndRecombine(N); // Replace the uses of Use with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), Result.getValue(isLoad ? 1 : 0)); deleteAndRecombine(Op); return true; } } } return false; } /// \brief Return the base-pointer arithmetic from an indexed \p LD. SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { ISD::MemIndexedMode AM = LD->getAddressingMode(); assert(AM != ISD::UNINDEXED); SDValue BP = LD->getOperand(1); SDValue Inc = LD->getOperand(2); // Some backends use TargetConstants for load offsets, but don't expect // TargetConstants in general ADD nodes. We can convert these constants into // regular Constants (if the constant is not opaque). assert((Inc.getOpcode() != ISD::TargetConstant || !cast(Inc)->isOpaque()) && "Cannot split out indexing using opaque target constants"); if (Inc.getOpcode() == ISD::TargetConstant) { ConstantSDNode *ConstInc = cast(Inc); Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc), ConstInc->getValueType(0)); } unsigned Opc = (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB); return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); } SDValue DAGCombiner::visitLOAD(SDNode *N) { LoadSDNode *LD = cast(N); SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); // If load is not volatile and there are no uses of the loaded value (and // the updated indexed value in case of indexed loads), change uses of the // chain value into uses of the chain input (i.e. delete the dead load). if (!LD->isVolatile()) { if (N->getValueType(1) == MVT::Other) { // Unindexed loads. if (!N->hasAnyUseOfValue(0)) { // It's not safe to use the two value CombineTo variant here. e.g. // v1, chain2 = load chain1, loc // v2, chain3 = load chain2, loc // v3 = add v2, c // Now we replace use of chain2 with chain1. This makes the second load // isomorphic to the one we are deleting, and thus makes this load live. DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG); dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG); dbgs() << "\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); AddUsersToWorklist(Chain.getNode()); if (N->use_empty()) deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } else { // Indexed loads. assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?"); // If this load has an opaque TargetConstant offset, then we cannot split // the indexing into an add/sub directly (that TargetConstant may not be // valid for a different type of node, and we cannot convert an opaque // target constant into a regular constant). bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant && cast(LD->getOperand(2))->isOpaque(); if (!N->hasAnyUseOfValue(0) && ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) { SDValue Undef = DAG.getUNDEF(N->getValueType(0)); SDValue Index; if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) { Index = SplitIndexingFromLoad(LD); // Try to fold the base pointer arithmetic into subsequent loads and // stores. AddUsersToWorklist(N); } else Index = DAG.getUNDEF(N->getValueType(1)); DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG); dbgs() << " and 2 other values\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain); deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } // If this load is directly stored, replace the load value with the stored // value. // TODO: Handle store large -> read small portion. // TODO: Handle TRUNCSTORE/LOADEXT if (OptLevel != CodeGenOpt::None && ISD::isNormalLoad(N) && !LD->isVolatile()) { if (ISD::isNON_TRUNCStore(Chain.getNode())) { StoreSDNode *PrevST = cast(Chain); if (PrevST->getBasePtr() == Ptr && PrevST->getValue().getValueType() == N->getValueType(0)) return CombineTo(N, PrevST->getOperand(1), Chain); } } // Try to infer better alignment information than the load already has. if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > LD->getMemOperand()->getBaseAlignment()) { SDValue NewLoad = DAG.getExtLoad( LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr, LD->getPointerInfo(), LD->getMemoryVT(), Align, LD->getMemOperand()->getFlags(), LD->getAAInfo()); if (NewLoad.getNode() != N) return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true); } } } if (LD->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes. SDValue BetterChain = FindBetterChain(N, Chain); // If there is a better chain. if (Chain != BetterChain) { SDValue ReplLoad; // Replace the chain to void dependency. if (LD->getExtensionType() == ISD::NON_EXTLOAD) { ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD), BetterChain, Ptr, LD->getMemOperand()); } else { ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), LD->getValueType(0), BetterChain, Ptr, LD->getMemoryVT(), LD->getMemOperand()); } // Create token factor to keep old chain connected. SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Chain, ReplLoad.getValue(1)); // Replace uses with load result and token factor return CombineTo(N, ReplLoad.getValue(0), Token); } } // Try transforming N to an indexed load. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); // Try to slice up N to more direct loads if the slices are mapped to // different register banks or pairing can take place. if (SliceUpLoad(N)) return SDValue(N, 0); return SDValue(); } namespace { /// \brief Helper structure used to slice a load in smaller loads. /// Basically a slice is obtained from the following sequence: /// Origin = load Ty1, Base /// Shift = srl Ty1 Origin, CstTy Amount /// Inst = trunc Shift to Ty2 /// /// Then, it will be rewritten into: /// Slice = load SliceTy, Base + SliceOffset /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2 /// /// SliceTy is deduced from the number of bits that are actually used to /// build Inst. struct LoadedSlice { /// \brief Helper structure used to compute the cost of a slice. struct Cost { /// Are we optimizing for code size. bool ForCodeSize; /// Various cost. unsigned Loads = 0; unsigned Truncates = 0; unsigned CrossRegisterBanksCopies = 0; unsigned ZExts = 0; unsigned Shift = 0; Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {} /// \brief Get the cost of one isolated slice. Cost(const LoadedSlice &LS, bool ForCodeSize = false) : ForCodeSize(ForCodeSize), Loads(1) { EVT TruncType = LS.Inst->getValueType(0); EVT LoadedType = LS.getLoadedType(); if (TruncType != LoadedType && !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType)) ZExts = 1; } /// \brief Account for slicing gain in the current cost. /// Slicing provide a few gains like removing a shift or a /// truncate. This method allows to grow the cost of the original /// load with the gain from this slice. void addSliceGain(const LoadedSlice &LS) { // Each slice saves a truncate. const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(), LS.Inst->getValueType(0))) ++Truncates; // If there is a shift amount, this slice gets rid of it. if (LS.Shift) ++Shift; // If this slice can merge a cross register bank copy, account for it. if (LS.canMergeExpensiveCrossRegisterBankCopy()) ++CrossRegisterBanksCopies; } Cost &operator+=(const Cost &RHS) { Loads += RHS.Loads; Truncates += RHS.Truncates; CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies; ZExts += RHS.ZExts; Shift += RHS.Shift; return *this; } bool operator==(const Cost &RHS) const { return Loads == RHS.Loads && Truncates == RHS.Truncates && CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies && ZExts == RHS.ZExts && Shift == RHS.Shift; } bool operator!=(const Cost &RHS) const { return !(*this == RHS); } bool operator<(const Cost &RHS) const { // Assume cross register banks copies are as expensive as loads. // FIXME: Do we want some more target hooks? unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies; unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies; // Unless we are optimizing for code size, consider the // expensive operation first. if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS) return ExpensiveOpsLHS < ExpensiveOpsRHS; return (Truncates + ZExts + Shift + ExpensiveOpsLHS) < (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS); } bool operator>(const Cost &RHS) const { return RHS < *this; } bool operator<=(const Cost &RHS) const { return !(RHS < *this); } bool operator>=(const Cost &RHS) const { return !(*this < RHS); } }; // The last instruction that represent the slice. This should be a // truncate instruction. SDNode *Inst; // The original load instruction. LoadSDNode *Origin; // The right shift amount in bits from the original load. unsigned Shift; // The DAG from which Origin came from. // This is used to get some contextual information about legal types, etc. SelectionDAG *DAG; LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr, unsigned Shift = 0, SelectionDAG *DAG = nullptr) : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {} /// \brief Get the bits used in a chunk of bits \p BitWidth large. /// \return Result is \p BitWidth and has used bits set to 1 and /// not used bits set to 0. APInt getUsedBits() const { // Reproduce the trunc(lshr) sequence: // - Start from the truncated value. // - Zero extend to the desired bit width. // - Shift left. assert(Origin && "No original load to compare against."); unsigned BitWidth = Origin->getValueSizeInBits(0); assert(Inst && "This slice is not bound to an instruction"); assert(Inst->getValueSizeInBits(0) <= BitWidth && "Extracted slice is bigger than the whole type!"); APInt UsedBits(Inst->getValueSizeInBits(0), 0); UsedBits.setAllBits(); UsedBits = UsedBits.zext(BitWidth); UsedBits <<= Shift; return UsedBits; } /// \brief Get the size of the slice to be loaded in bytes. unsigned getLoadedSize() const { unsigned SliceSize = getUsedBits().countPopulation(); assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte."); return SliceSize / 8; } /// \brief Get the type that will be loaded for this slice. /// Note: This may not be the final type for the slice. EVT getLoadedType() const { assert(DAG && "Missing context"); LLVMContext &Ctxt = *DAG->getContext(); return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8); } /// \brief Get the alignment of the load used for this slice. unsigned getAlignment() const { unsigned Alignment = Origin->getAlignment(); unsigned Offset = getOffsetFromBase(); if (Offset != 0) Alignment = MinAlign(Alignment, Alignment + Offset); return Alignment; } /// \brief Check if this slice can be rewritten with legal operations. bool isLegal() const { // An invalid slice is not legal. if (!Origin || !Inst || !DAG) return false; // Offsets are for indexed load only, we do not handle that. if (!Origin->getOffset().isUndef()) return false; const TargetLowering &TLI = DAG->getTargetLoweringInfo(); // Check that the type is legal. EVT SliceType = getLoadedType(); if (!TLI.isTypeLegal(SliceType)) return false; // Check that the load is legal for this type. if (!TLI.isOperationLegal(ISD::LOAD, SliceType)) return false; // Check that the offset can be computed. // 1. Check its type. EVT PtrType = Origin->getBasePtr().getValueType(); if (PtrType == MVT::Untyped || PtrType.isExtended()) return false; // 2. Check that it fits in the immediate. if (!TLI.isLegalAddImmediate(getOffsetFromBase())) return false; // 3. Check that the computation is legal. if (!TLI.isOperationLegal(ISD::ADD, PtrType)) return false; // Check that the zext is legal if it needs one. EVT TruncateType = Inst->getValueType(0); if (TruncateType != SliceType && !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType)) return false; return true; } /// \brief Get the offset in bytes of this slice in the original chunk of /// bits. /// \pre DAG != nullptr. uint64_t getOffsetFromBase() const { assert(DAG && "Missing context."); bool IsBigEndian = DAG->getDataLayout().isBigEndian(); assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported."); uint64_t Offset = Shift / 8; unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8; assert(!(Origin->getValueSizeInBits(0) & 0x7) && "The size of the original loaded type is not a multiple of a" " byte."); // If Offset is bigger than TySizeInBytes, it means we are loading all // zeros. This should have been optimized before in the process. assert(TySizeInBytes > Offset && "Invalid shift amount for given loaded size"); if (IsBigEndian) Offset = TySizeInBytes - Offset - getLoadedSize(); return Offset; } /// \brief Generate the sequence of instructions to load the slice /// represented by this object and redirect the uses of this slice to /// this new sequence of instructions. /// \pre this->Inst && this->Origin are valid Instructions and this /// object passed the legal check: LoadedSlice::isLegal returned true. /// \return The last instruction of the sequence used to load the slice. SDValue loadSlice() const { assert(Inst && Origin && "Unable to replace a non-existing slice."); const SDValue &OldBaseAddr = Origin->getBasePtr(); SDValue BaseAddr = OldBaseAddr; // Get the offset in that chunk of bytes w.r.t. the endianness. int64_t Offset = static_cast(getOffsetFromBase()); assert(Offset >= 0 && "Offset too big to fit in int64_t!"); if (Offset) { // BaseAddr = BaseAddr + Offset. EVT ArithType = BaseAddr.getValueType(); SDLoc DL(Origin); BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr, DAG->getConstant(Offset, DL, ArithType)); } // Create the type of the loaded slice according to its size. EVT SliceType = getLoadedType(); // Create the load for the slice. SDValue LastInst = DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr, Origin->getPointerInfo().getWithOffset(Offset), getAlignment(), Origin->getMemOperand()->getFlags()); // If the final type is not the same as the loaded type, this means that // we have to pad with zero. Create a zero extend for that. EVT FinalType = Inst->getValueType(0); if (SliceType != FinalType) LastInst = DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst); return LastInst; } /// \brief Check if this slice can be merged with an expensive cross register /// bank copy. E.g., /// i = load i32 /// f = bitcast i32 i to float bool canMergeExpensiveCrossRegisterBankCopy() const { if (!Inst || !Inst->hasOneUse()) return false; SDNode *Use = *Inst->use_begin(); if (Use->getOpcode() != ISD::BITCAST) return false; assert(DAG && "Missing context"); const TargetLowering &TLI = DAG->getTargetLoweringInfo(); EVT ResVT = Use->getValueType(0); const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT()); const TargetRegisterClass *ArgRC = TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT()); if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT)) return false; // At this point, we know that we perform a cross-register-bank copy. // Check if it is expensive. const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo(); // Assume bitcasts are cheap, unless both register classes do not // explicitly share a common sub class. if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) return false; // Check if it will be merged with the load. // 1. Check the alignment constraint. unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment( ResVT.getTypeForEVT(*DAG->getContext())); if (RequiredAlignment > getAlignment()) return false; // 2. Check that the load is a legal operation for that type. if (!TLI.isOperationLegal(ISD::LOAD, ResVT)) return false; // 3. Check that we do not have a zext in the way. if (Inst->getValueType(0) != getLoadedType()) return false; return true; } }; } // end anonymous namespace /// \brief Check that all bits set in \p UsedBits form a dense region, i.e., /// \p UsedBits looks like 0..0 1..1 0..0. static bool areUsedBitsDense(const APInt &UsedBits) { // If all the bits are one, this is dense! if (UsedBits.isAllOnesValue()) return true; // Get rid of the unused bits on the right. APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros()); // Get rid of the unused bits on the left. if (NarrowedUsedBits.countLeadingZeros()) NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits()); // Check that the chunk of bits is completely used. return NarrowedUsedBits.isAllOnesValue(); } /// \brief Check whether or not \p First and \p Second are next to each other /// in memory. This means that there is no hole between the bits loaded /// by \p First and the bits loaded by \p Second. static bool areSlicesNextToEachOther(const LoadedSlice &First, const LoadedSlice &Second) { assert(First.Origin == Second.Origin && First.Origin && "Unable to match different memory origins."); APInt UsedBits = First.getUsedBits(); assert((UsedBits & Second.getUsedBits()) == 0 && "Slices are not supposed to overlap."); UsedBits |= Second.getUsedBits(); return areUsedBitsDense(UsedBits); } /// \brief Adjust the \p GlobalLSCost according to the target /// paring capabilities and the layout of the slices. /// \pre \p GlobalLSCost should account for at least as many loads as /// there is in the slices in \p LoadedSlices. static void adjustCostForPairing(SmallVectorImpl &LoadedSlices, LoadedSlice::Cost &GlobalLSCost) { unsigned NumberOfSlices = LoadedSlices.size(); // If there is less than 2 elements, no pairing is possible. if (NumberOfSlices < 2) return; // Sort the slices so that elements that are likely to be next to each // other in memory are next to each other in the list. std::sort(LoadedSlices.begin(), LoadedSlices.end(), [](const LoadedSlice &LHS, const LoadedSlice &RHS) { assert(LHS.Origin == RHS.Origin && "Different bases not implemented."); return LHS.getOffsetFromBase() < RHS.getOffsetFromBase(); }); const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo(); // First (resp. Second) is the first (resp. Second) potentially candidate // to be placed in a paired load. const LoadedSlice *First = nullptr; const LoadedSlice *Second = nullptr; for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice, // Set the beginning of the pair. First = Second) { Second = &LoadedSlices[CurrSlice]; // If First is NULL, it means we start a new pair. // Get to the next slice. if (!First) continue; EVT LoadedType = First->getLoadedType(); // If the types of the slices are different, we cannot pair them. if (LoadedType != Second->getLoadedType()) continue; // Check if the target supplies paired loads for this type. unsigned RequiredAlignment = 0; if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) { // move to the next pair, this type is hopeless. Second = nullptr; continue; } // Check if we meet the alignment requirement. if (RequiredAlignment > First->getAlignment()) continue; // Check that both loads are next to each other in memory. if (!areSlicesNextToEachOther(*First, *Second)) continue; assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!"); --GlobalLSCost.Loads; // Move to the next pair. Second = nullptr; } } /// \brief Check the profitability of all involved LoadedSlice. /// Currently, it is considered profitable if there is exactly two /// involved slices (1) which are (2) next to each other in memory, and /// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3). /// /// Note: The order of the elements in \p LoadedSlices may be modified, but not /// the elements themselves. /// /// FIXME: When the cost model will be mature enough, we can relax /// constraints (1) and (2). static bool isSlicingProfitable(SmallVectorImpl &LoadedSlices, const APInt &UsedBits, bool ForCodeSize) { unsigned NumberOfSlices = LoadedSlices.size(); if (StressLoadSlicing) return NumberOfSlices > 1; // Check (1). if (NumberOfSlices != 2) return false; // Check (2). if (!areUsedBitsDense(UsedBits)) return false; // Check (3). LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize); // The original code has one big load. OrigCost.Loads = 1; for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) { const LoadedSlice &LS = LoadedSlices[CurrSlice]; // Accumulate the cost of all the slices. LoadedSlice::Cost SliceCost(LS, ForCodeSize); GlobalSlicingCost += SliceCost; // Account as cost in the original configuration the gain obtained // with the current slices. OrigCost.addSliceGain(LS); } // If the target supports paired load, adjust the cost accordingly. adjustCostForPairing(LoadedSlices, GlobalSlicingCost); return OrigCost > GlobalSlicingCost; } /// \brief If the given load, \p LI, is used only by trunc or trunc(lshr) /// operations, split it in the various pieces being extracted. /// /// This sort of thing is introduced by SROA. /// This slicing takes care not to insert overlapping loads. /// \pre LI is a simple load (i.e., not an atomic or volatile load). bool DAGCombiner::SliceUpLoad(SDNode *N) { if (Level < AfterLegalizeDAG) return false; LoadSDNode *LD = cast(N); if (LD->isVolatile() || !ISD::isNormalLoad(LD) || !LD->getValueType(0).isInteger()) return false; // Keep track of already used bits to detect overlapping values. // In that case, we will just abort the transformation. APInt UsedBits(LD->getValueSizeInBits(0), 0); SmallVector LoadedSlices; // Check if this load is used as several smaller chunks of bits. // Basically, look for uses in trunc or trunc(lshr) and record a new chain // of computation for each trunc. for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end(); UI != UIEnd; ++UI) { // Skip the uses of the chain. if (UI.getUse().getResNo() != 0) continue; SDNode *User = *UI; unsigned Shift = 0; // Check if this is a trunc(lshr). if (User->getOpcode() == ISD::SRL && User->hasOneUse() && isa(User->getOperand(1))) { Shift = User->getConstantOperandVal(1); User = *User->use_begin(); } // At this point, User is a Truncate, iff we encountered, trunc or // trunc(lshr). if (User->getOpcode() != ISD::TRUNCATE) return false; // The width of the type must be a power of 2 and greater than 8-bits. // Otherwise the load cannot be represented in LLVM IR. // Moreover, if we shifted with a non-8-bits multiple, the slice // will be across several bytes. We do not support that. unsigned Width = User->getValueSizeInBits(0); if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7)) return false; // Build the slice for this chain of computations. LoadedSlice LS(User, LD, Shift, &DAG); APInt CurrentUsedBits = LS.getUsedBits(); // Check if this slice overlaps with another. if ((CurrentUsedBits & UsedBits) != 0) return false; // Update the bits used globally. UsedBits |= CurrentUsedBits; // Check if the new slice would be legal. if (!LS.isLegal()) return false; // Record the slice. LoadedSlices.push_back(LS); } // Abort slicing if it does not seem to be profitable. if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize)) return false; ++SlicedLoads; // Rewrite each chain to use an independent load. // By construction, each chain can be represented by a unique load. // Prepare the argument for the new token factor for all the slices. SmallVector ArgChains; for (SmallVectorImpl::const_iterator LSIt = LoadedSlices.begin(), LSItEnd = LoadedSlices.end(); LSIt != LSItEnd; ++LSIt) { SDValue SliceInst = LSIt->loadSlice(); CombineTo(LSIt->Inst, SliceInst, true); if (SliceInst.getOpcode() != ISD::LOAD) SliceInst = SliceInst.getOperand(0); assert(SliceInst->getOpcode() == ISD::LOAD && "It takes more than a zext to get to the loaded slice!!"); ArgChains.push_back(SliceInst.getValue(1)); } SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, ArgChains); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); AddToWorklist(Chain.getNode()); return true; } /// Check to see if V is (and load (ptr), imm), where the load is having /// specific bytes cleared out. If so, return the byte size being masked out /// and the shift amount. static std::pair CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { std::pair Result(0, 0); // Check for the structure we're looking for. if (V->getOpcode() != ISD::AND || !isa(V->getOperand(1)) || !ISD::isNormalLoad(V->getOperand(0).getNode())) return Result; // Check the chain and pointer. LoadSDNode *LD = cast(V->getOperand(0)); if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer. // The store should be chained directly to the load or be an operand of a // tokenfactor. if (LD == Chain.getNode()) ; // ok. else if (Chain->getOpcode() != ISD::TokenFactor) return Result; // Fail. else { bool isOk = false; for (const SDValue &ChainOp : Chain->op_values()) if (ChainOp.getNode() == LD) { isOk = true; break; } if (!isOk) return Result; } // This only handles simple types. if (V.getValueType() != MVT::i16 && V.getValueType() != MVT::i32 && V.getValueType() != MVT::i64) return Result; // Check the constant mask. Invert it so that the bits being masked out are // 0 and the bits being kept are 1. Use getSExtValue so that leading bits // follow the sign bit for uniformity. uint64_t NotMask = ~cast(V->getOperand(1))->getSExtValue(); unsigned NotMaskLZ = countLeadingZeros(NotMask); if (NotMaskLZ & 7) return Result; // Must be multiple of a byte. unsigned NotMaskTZ = countTrailingZeros(NotMask); if (NotMaskTZ & 7) return Result; // Must be multiple of a byte. if (NotMaskLZ == 64) return Result; // All zero mask. // See if we have a continuous run of bits. If so, we have 0*1+0* if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64) return Result; // Adjust NotMaskLZ down to be from the actual size of the int instead of i64. if (V.getValueType() != MVT::i64 && NotMaskLZ) NotMaskLZ -= 64-V.getValueSizeInBits(); unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8; switch (MaskedBytes) { case 1: case 2: case 4: break; default: return Result; // All one mask, or 5-byte mask. } // Verify that the first bit starts at a multiple of mask so that the access // is aligned the same as the access width. if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result; Result.first = MaskedBytes; Result.second = NotMaskTZ/8; return Result; } /// Check to see if IVal is something that provides a value as specified by /// MaskInfo. If so, replace the specified store with a narrower store of /// truncated IVal. static SDNode * ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, SDValue IVal, StoreSDNode *St, DAGCombiner *DC) { unsigned NumBytes = MaskInfo.first; unsigned ByteShift = MaskInfo.second; SelectionDAG &DAG = DC->getDAG(); // Check to see if IVal is all zeros in the part being masked in by the 'or' // that uses this. If not, this is not a replacement. APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(), ByteShift*8, (ByteShift+NumBytes)*8); if (!DAG.MaskedValueIsZero(IVal, Mask)) return nullptr; // Check that it is legal on the target to do this. It is legal if the new // VT we're shrinking to (i8/i16/i32) is legal or we're still before type // legalization. MVT VT = MVT::getIntegerVT(NumBytes*8); if (!DC->isTypeLegal(VT)) return nullptr; // Okay, we can do this! Replace the 'St' store with a store of IVal that is // shifted by ByteShift and truncated down to NumBytes. if (ByteShift) { SDLoc DL(IVal); IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal, DAG.getConstant(ByteShift*8, DL, DC->getShiftAmountTy(IVal.getValueType()))); } // Figure out the offset for the store and the alignment of the access. unsigned StOffset; unsigned NewAlign = St->getAlignment(); if (DAG.getDataLayout().isLittleEndian()) StOffset = ByteShift; else StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes; SDValue Ptr = St->getBasePtr(); if (StOffset) { SDLoc DL(IVal); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StOffset, DL, Ptr.getValueType())); NewAlign = MinAlign(NewAlign, StOffset); } // Truncate down to the new size. IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal); ++OpsNarrowed; return DAG .getStore(St->getChain(), SDLoc(St), IVal, Ptr, St->getPointerInfo().getWithOffset(StOffset), NewAlign) .getNode(); } /// Look for sequence of load / op / store where op is one of 'or', 'xor', and /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try /// narrowing the load and store if it would end up being a win for performance /// or code size. SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { StoreSDNode *ST = cast(N); if (ST->isVolatile()) return SDValue(); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); EVT VT = Value.getValueType(); if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse()) return SDValue(); unsigned Opc = Value.getOpcode(); // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst // is a byte mask indicating a consecutive number of bytes, check to see if // Y is known to provide just those bytes. If so, we try to replace the // load + replace + store sequence with a single (narrower) store, which makes // the load dead. if (Opc == ISD::OR) { std::pair MaskedLoad; MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain); if (MaskedLoad.first) if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, Value.getOperand(1), ST,this)) return SDValue(NewST, 0); // Or is commutative, so try swapping X and Y. MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain); if (MaskedLoad.first) if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, Value.getOperand(0), ST,this)) return SDValue(NewST, 0); } if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) || Value.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N0 = Value.getOperand(0); if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && Chain == SDValue(N0.getNode(), 1)) { LoadSDNode *LD = cast(N0); if (LD->getBasePtr() != Ptr || LD->getPointerInfo().getAddrSpace() != ST->getPointerInfo().getAddrSpace()) return SDValue(); // Find the type to narrow it the load / op / store to. SDValue N1 = Value.getOperand(1); unsigned BitWidth = N1.getValueSizeInBits(); APInt Imm = cast(N1)->getAPIntValue(); if (Opc == ISD::AND) Imm ^= APInt::getAllOnesValue(BitWidth); if (Imm == 0 || Imm.isAllOnesValue()) return SDValue(); unsigned ShAmt = Imm.countTrailingZeros(); unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1; unsigned NewBW = NextPowerOf2(MSB - ShAmt); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); // The narrowing should be profitable, the load/store operation should be // legal (or custom) and the store size should be equal to the NewVT width. while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW || !TLI.isOperationLegalOrCustom(Opc, NewVT) || !TLI.isNarrowingProfitable(VT, NewVT))) { NewBW = NextPowerOf2(NewBW); NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); } if (NewBW >= BitWidth) return SDValue(); // If the lsb changed does not start at the type bitwidth boundary, // start at the previous one. if (ShAmt % NewBW) ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW; APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, std::min(BitWidth, ShAmt + NewBW)); if ((Imm & Mask) == Imm) { APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW); if (Opc == ISD::AND) NewImm ^= APInt::getAllOnesValue(NewBW); uint64_t PtrOff = ShAmt / 8; // For big endian targets, we need to adjust the offset to the pointer to // load the correct bytes. if (DAG.getDataLayout().isBigEndian()) PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff; unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff); Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext()); if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy)) return SDValue(); SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD), Ptr.getValueType(), Ptr, DAG.getConstant(PtrOff, SDLoc(LD), Ptr.getValueType())); SDValue NewLD = DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo()); SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, DAG.getConstant(NewImm, SDLoc(Value), NewVT)); SDValue NewST = DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr, ST->getPointerInfo().getWithOffset(PtrOff), NewAlign); AddToWorklist(NewPtr.getNode()); AddToWorklist(NewLD.getNode()); AddToWorklist(NewVal.getNode()); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); ++OpsNarrowed; return NewST; } } return SDValue(); } /// For a given floating point load / store pair, if the load value isn't used /// by any other operations, then consider transforming the pair to integer /// load / store operations if the target deems the transformation profitable. SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) && Value.hasOneUse() && Chain == SDValue(Value.getNode(), 1)) { LoadSDNode *LD = cast(Value); EVT VT = LD->getMemoryVT(); if (!VT.isFloatingPoint() || VT != ST->getMemoryVT() || LD->isNonTemporal() || ST->isNonTemporal() || LD->getPointerInfo().getAddrSpace() != 0 || ST->getPointerInfo().getAddrSpace() != 0) return SDValue(); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); if (!TLI.isOperationLegal(ISD::LOAD, IntVT) || !TLI.isOperationLegal(ISD::STORE, IntVT) || !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) || !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT)) return SDValue(); unsigned LDAlign = LD->getAlignment(); unsigned STAlign = ST->getAlignment(); Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext()); unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy); if (LDAlign < ABIAlign || STAlign < ABIAlign) return SDValue(); SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LDAlign); SDValue NewST = DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(), ST->getPointerInfo(), STAlign); AddToWorklist(NewLD.getNode()); AddToWorklist(NewST.getNode()); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1)); ++LdStFP2Int; return NewST; } return SDValue(); } // This is a helper function for visitMUL to check the profitability // of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). // MulNode is the original multiply, AddNode is (add x, c1), // and ConstNode is c2. // // If the (add x, c1) has multiple uses, we could increase // the number of adds if we make this transformation. // It would only be worth doing this if we can remove a // multiply in the process. Check for that here. // To illustrate: // (A + c1) * c3 // (A + c2) * c3 // We're checking for cases where we have common "c3 * A" expressions. bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, SDValue &AddNode, SDValue &ConstNode) { APInt Val; // If the add only has one use, this would be OK to do. if (AddNode.getNode()->hasOneUse()) return true; // Walk all the users of the constant with which we're multiplying. for (SDNode *Use : ConstNode->uses()) { if (Use == MulNode) // This use is the one we're on right now. Skip it. continue; if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. SDNode *OtherOp; SDNode *MulVar = AddNode.getOperand(0).getNode(); // OtherOp is what we're multiplying against the constant. if (Use->getOperand(0) == ConstNode) OtherOp = Use->getOperand(1).getNode(); else OtherOp = Use->getOperand(0).getNode(); // Check to see if multiply is with the same operand of our "add". // // ConstNode = CONST // Use = ConstNode * A <-- visiting Use. OtherOp is A. // ... // AddNode = (A + c1) <-- MulVar is A. // = AddNode * ConstNode <-- current visiting instruction. // // If we make this transformation, we will have a common // multiply (ConstNode * A) that we can save. if (OtherOp == MulVar) return true; // Now check to see if a future expansion will give us a common // multiply. // // ConstNode = CONST // AddNode = (A + c1) // ... = AddNode * ConstNode <-- current visiting instruction. // ... // OtherOp = (A + c2) // Use = OtherOp * ConstNode <-- visiting Use. // // If we make this transformation, we will have a common // multiply (CONST * A) after we also do the same transformation // to the "t2" instruction. if (OtherOp->getOpcode() == ISD::ADD && DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) && OtherOp->getOperand(0).getNode() == MulVar) return true; } } // Didn't find a case where this would be profitable. return false; } static SDValue peekThroughBitcast(SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); return V; } SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl &StoreNodes, unsigned NumStores) { SmallVector Chains; SmallPtrSet Visited; SDLoc StoreDL(StoreNodes[0].MemNode); for (unsigned i = 0; i < NumStores; ++i) { Visited.insert(StoreNodes[i].MemNode); } // don't include nodes that are children for (unsigned i = 0; i < NumStores; ++i) { if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0) Chains.push_back(StoreNodes[i].MemNode->getChain()); } assert(Chains.size() > 0 && "Chain should have generated a chain"); return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains); } bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SmallVectorImpl &StoreNodes, EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector, bool UseTrunc) { // Make sure we have something to merge. if (NumStores < 2) return false; // The latest Node in the DAG. SDLoc DL(StoreNodes[0].MemNode); int64_t ElementSizeBits = MemVT.getStoreSizeInBits(); unsigned SizeInBits = NumStores * ElementSizeBits; unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; EVT StoreTy; if (UseVector) { unsigned Elts = NumStores * NumMemElts; // Get the type for the merged vector store. StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); } else StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); SDValue StoredVal; if (UseVector) { if (IsConstantSrc) { SmallVector BuildVector; for (unsigned I = 0; I != NumStores; ++I) { StoreSDNode *St = cast(StoreNodes[I].MemNode); SDValue Val = St->getValue(); // If constant is of the wrong type, convert it now. if (MemVT != Val.getValueType()) { Val = peekThroughBitcast(Val); // Deal with constants of wrong size. if (ElementSizeBits != Val.getValueSizeInBits()) { EVT IntMemVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); if (isa(Val)) { // Not clear how to truncate FP values. return false; } else if (auto *C = dyn_cast(Val)) Val = DAG.getConstant(C->getAPIntValue() .zextOrTrunc(Val.getValueSizeInBits()) .zextOrTrunc(ElementSizeBits), SDLoc(C), IntMemVT); } // Make sure correctly size type is the correct type. Val = DAG.getBitcast(MemVT, Val); } BuildVector.push_back(Val); } StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, StoreTy, BuildVector); } else { SmallVector Ops; for (unsigned i = 0; i < NumStores; ++i) { StoreSDNode *St = cast(StoreNodes[i].MemNode); SDValue Val = peekThroughBitcast(St->getValue()); // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of // type MemVT. If the underlying value is not the correct // type, but it is an extraction of an appropriate vector we // can recast Val to be of the correct type. This may require // converting between EXTRACT_VECTOR_ELT and // EXTRACT_SUBVECTOR. if ((MemVT != Val.getValueType()) && (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) { SDValue Vec = Val.getOperand(0); EVT MemVTScalarTy = MemVT.getScalarType(); // We may need to add a bitcast here to get types to line up. if (MemVTScalarTy != Vec.getValueType()) { unsigned Elts = Vec.getValueType().getSizeInBits() / MemVTScalarTy.getSizeInBits(); EVT NewVecTy = EVT::getVectorVT(*DAG.getContext(), MemVTScalarTy, Elts); Vec = DAG.getBitcast(NewVecTy, Vec); } auto OpC = (MemVT.isVector()) ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT; Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Val.getOperand(1)); } Ops.push_back(Val); } // Build the extracted vector elements back into a vector. StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, StoreTy, Ops); } } else { // We should always use a vector store when merging extracted vector // elements, so this path implies a store of constants. assert(IsConstantSrc && "Merged vector elements should use vector store"); APInt StoreInt(SizeInBits, 0); // Construct a single integer constant which is made of the smaller // constant inputs. bool IsLE = DAG.getDataLayout().isLittleEndian(); for (unsigned i = 0; i < NumStores; ++i) { unsigned Idx = IsLE ? (NumStores - 1 - i) : i; StoreSDNode *St = cast(StoreNodes[Idx].MemNode); SDValue Val = St->getValue(); StoreInt <<= ElementSizeBits; if (ConstantSDNode *C = dyn_cast(Val)) { StoreInt |= C->getAPIntValue() .zextOrTrunc(ElementSizeBits) .zextOrTrunc(SizeInBits); } else if (ConstantFPSDNode *C = dyn_cast(Val)) { StoreInt |= C->getValueAPF() .bitcastToAPInt() .zextOrTrunc(ElementSizeBits) .zextOrTrunc(SizeInBits); // If fp truncation is necessary give up for now. if (MemVT.getSizeInBits() != ElementSizeBits) return false; } else { llvm_unreachable("Invalid constant element type"); } } // Create the new Load and Store operations. StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); // make sure we use trunc store if it's necessary to be legal. SDValue NewStore; if (!UseTrunc) { NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstInChain->getAlignment()); } else { // Must be realized as a trunc store EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); unsigned LegalizedStoreSize = LegalizedStoredValueTy.getSizeInBits(); ConstantSDNode *C = cast(StoredVal); SDValue ExtendedStoreVal = DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL, LegalizedStoredValueTy); NewStore = DAG.getTruncStore( NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, FirstInChain->getAlignment(), FirstInChain->getMemOperand()->getFlags()); } // Replace all merged stores with the new store. for (unsigned i = 0; i < NumStores; ++i) CombineTo(StoreNodes[i].MemNode, NewStore); AddToWorklist(NewChain.getNode()); return true; } void DAGCombiner::getStoreMergeCandidates( StoreSDNode *St, SmallVectorImpl &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); EVT MemVT = St->getMemoryVT(); SDValue Val = peekThroughBitcast(St->getValue()); // We must have a base and an offset. if (!BasePtr.getBase().getNode()) return; // Do not handle stores to undef base pointers. if (BasePtr.getBase().isUndef()) return; bool IsConstantSrc = isa(Val) || isa(Val); bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || Val.getOpcode() == ISD::EXTRACT_SUBVECTOR); bool IsLoadSrc = isa(Val); BaseIndexOffset LBasePtr; // Match on loadbaseptr if relevant. EVT LoadVT; if (IsLoadSrc) { auto *Ld = cast(Val); LBasePtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); LoadVT = Ld->getMemoryVT(); // Load and store should be the same type. if (MemVT != LoadVT) return; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { if (Other->isVolatile() || Other->isIndexed()) return false; SDValue Val = peekThroughBitcast(Other->getValue()); // Allow merging constants of different types as integers. bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) : Other->getMemoryVT() != MemVT; if (IsLoadSrc) { if (NoTypeMatch) return false; // The Load's Base Ptr must also match if (LoadSDNode *OtherLd = dyn_cast(Val)) { auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG); if (LoadVT != OtherLd->getMemoryVT()) return false; if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) return false; } else return false; } if (IsConstantSrc) { if (NoTypeMatch) return false; if (!(isa(Val) || isa(Val))) return false; } if (IsExtractVecSrc) { // Do not merge truncated stores here. if (Other->isTruncatingStore()) return false; if (!MemVT.bitsEq(Val.getValueType())) return false; if (Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT && Val.getOpcode() != ISD::EXTRACT_SUBVECTOR) return false; } Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG); return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; // We looking for a root node which is an ancestor to all mergable // stores. We search up through a load, to our root and then down // through all children. For instance we will find Store{1,2,3} if // St is Store1, Store2. or Store3 where the root is not a load // which always true for nonvolatile ops. TODO: Expand // the search to find all valid candidates through multiple layers of loads. // // Root // |-------|-------| // Load Load Store3 // | | // Store1 Store2 // // FIXME: We should be able to climb and // descend TokenFactors to find candidates as well. SDNode *RootNode = (St->getChain()).getNode(); if (LoadSDNode *Ldn = dyn_cast(RootNode)) { RootNode = Ldn->getChain().getNode(); for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) if (I.getOperandNo() == 0 && isa(*I)) // walk down chain for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2) if (I2.getOperandNo() == 0) if (StoreSDNode *OtherST = dyn_cast(*I2)) { BaseIndexOffset Ptr; int64_t PtrDiff; if (CandidateMatch(OtherST, Ptr, PtrDiff)) StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); } } else for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) if (I.getOperandNo() == 0) if (StoreSDNode *OtherST = dyn_cast(*I)) { BaseIndexOffset Ptr; int64_t PtrDiff; if (CandidateMatch(OtherST, Ptr, PtrDiff)) StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); } } // We need to check that merging these stores does not cause a loop in // the DAG. Any store candidate may depend on another candidate // indirectly through its operand (we already consider dependencies // through the chain). Check in parallel by searching up from // non-chain operands of candidates. bool DAGCombiner::checkMergeStoreCandidatesForDependencies( SmallVectorImpl &StoreNodes, unsigned NumStores) { // FIXME: We should be able to truncate a full search of // predecessors by doing a BFS and keeping tabs the originating // stores from which worklist nodes come from in a similar way to // TokenFactor simplfication. SmallPtrSet Visited; SmallVector Worklist; unsigned int Max = 8192; // Search Ops of store candidates. for (unsigned i = 0; i < NumStores; ++i) { SDNode *n = StoreNodes[i].MemNode; // Potential loops may happen only through non-chain operands for (unsigned j = 1; j < n->getNumOperands(); ++j) Worklist.push_back(n->getOperand(j).getNode()); } // Search through DAG. We can stop early if we find a store node. for (unsigned i = 0; i < NumStores; ++i) { if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist, Max)) return false; // Check if we ended early, failing conservatively if so. if (Visited.size() >= Max) return false; } return true; } bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (OptLevel == CodeGenOpt::None) return false; EVT MemVT = St->getMemoryVT(); int64_t ElementSizeBytes = MemVT.getStoreSize(); unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) return false; bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); // This function cannot currently deal with non-byte-sized memory sizes. if (ElementSizeBytes * 8 != MemVT.getSizeInBits()) return false; if (!MemVT.isSimple()) return false; // Perform an early exit check. Do not bother looking at stored values that // are not constants, loads, or extracted vector elements. SDValue StoredVal = peekThroughBitcast(St->getValue()); bool IsLoadSrc = isa(StoredVal); bool IsConstantSrc = isa(StoredVal) || isa(StoredVal); bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) return false; SmallVector StoreNodes; // Find potential store merge candidates by searching through chain sub-DAG getStoreMergeCandidates(St, StoreNodes); // Check if there is anything to merge. if (StoreNodes.size() < 2) return false; // Sort the memory operands according to their distance from the // base pointer. std::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { return LHS.OffsetFromBase < RHS.OffsetFromBase; }); // Store Merge attempts to merge the lowest stores. This generally // works out as if successful, as the remaining stores are checked // after the first collection of stores is merged. However, in the // case that a non-mergeable store is found first, e.g., {p[-2], // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent // mergeable cases. To prevent this, we prune such stores from the // front of StoreNodes here. bool RV = false; while (StoreNodes.size() > 1) { unsigned StartIdx = 0; while ((StartIdx + 1 < StoreNodes.size()) && StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes != StoreNodes[StartIdx + 1].OffsetFromBase) ++StartIdx; // Bail if we don't have enough candidates to merge. if (StartIdx + 1 >= StoreNodes.size()) return RV; if (StartIdx) StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx); // Scan the memory operations on the chain and find the first // non-consecutive store memory address. unsigned NumConsecutiveStores = 1; int64_t StartAddress = StoreNodes[0].OffsetFromBase; // Check that the addresses are consecutive starting from the second // element in the list of stores. for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) { int64_t CurrAddress = StoreNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; NumConsecutiveStores = i + 1; } if (NumConsecutiveStores < 2) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumConsecutiveStores); continue; } // Check that we can merge these candidates without causing a cycle if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumConsecutiveStores)) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumConsecutiveStores); continue; } // The node with the lowest store address. LLVMContext &Context = *DAG.getContext(); const DataLayout &DL = DAG.getDataLayout(); // Store the constants into memory as one consecutive store. if (IsConstantSrc) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); unsigned LastLegalType = 1; unsigned LastLegalVectorType = 1; bool LastIntegerTrunc = false; bool NonZero = false; unsigned FirstZeroAfterNonZero = NumConsecutiveStores; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *ST = cast(StoreNodes[i].MemNode); SDValue StoredVal = ST->getValue(); bool IsElementZero = false; if (ConstantSDNode *C = dyn_cast(StoredVal)) IsElementZero = C->isNullValue(); else if (ConstantFPSDNode *C = dyn_cast(StoredVal)) IsElementZero = C->getConstantFPValue()->isNullValue(); if (IsElementZero) { if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores) FirstZeroAfterNonZero = i; } NonZero |= !IsElementZero; // Find a legal type for the constant store. unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); bool IsFast = false; if (TLI.isTypeLegal(StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { LastIntegerTrunc = false; LastLegalType = i + 1; // Or check whether a truncstore is legal. } else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { LastIntegerTrunc = true; LastLegalType = i + 1; } } // We only use vectors if the constant is known to be zero or the target // allows it and the function is not marked with the noimplicitfloat // attribute. if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && !NoVectors) { // Find a legal type for the vector store. unsigned Elts = (i + 1) * NumMemElts; EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) LastLegalVectorType = i + 1; } } bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; // Check if we found a legal integer type that creates a meaningful merge. if (NumElem < 2) { // We know that candidate stores are in order and of correct // shape. While there is no mergeable sequence from the // beginning one may start later in the sequence. The only // reason a merge of size N could have failed where another of // the same size would not have, is if the alignment has // improved or we've dropped a non-zero value. Drop as many // candidates as we can here. unsigned NumSkip = 1; while ( (NumSkip < NumConsecutiveStores) && (NumSkip < FirstZeroAfterNonZero) && (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) { NumSkip++; } StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); continue; } bool Merged = MergeStoresOfConstantsOrVecElts( StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc); RV |= Merged; // Remove merged stores for next iteration. StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); continue; } // When extracting multiple vector elements, try to store them // in one vector store rather than a sequence of scalar stores. if (IsExtractVecSrc) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); unsigned NumStoresToMerge = 1; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *St = cast(StoreNodes[i].MemNode); SDValue StVal = peekThroughBitcast(St->getValue()); // This restriction could be loosened. // Bail out if any stored values are not elements extracted from a // vector. It should be possible to handle mixed sources, but load // sources need more careful handling (see the block of code below that // handles consecutive loads). if (StVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT && StVal.getOpcode() != ISD::EXTRACT_SUBVECTOR) return RV; // Find a legal type for the vector store. unsigned Elts = (i + 1) * NumMemElts; EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); bool IsFast; if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) NumStoresToMerge = i + 1; } // Check if we found a legal integer type that creates a meaningful merge. if (NumStoresToMerge < 2) { // We know that candidate stores are in order and of correct // shape. While there is no mergeable sequence from the // beginning one may start later in the sequence. The only // reason a merge of size N could have failed where another of // the same size would not have, is if the alignment has // improved. Drop as many candidates as we can here. unsigned NumSkip = 1; while ((NumSkip < NumConsecutiveStores) && (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); continue; } bool Merged = MergeStoresOfConstantsOrVecElts( StoreNodes, MemVT, NumStoresToMerge, false, true, false); if (!Merged) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge); continue; } // Remove merged stores for next iteration. StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge); RV = true; continue; } // Below we handle the case of multiple consecutive stores that // come from multiple consecutive loads. We merge them into a single // wide load and a single wide store. // Look for load nodes which are used by the stored values. SmallVector LoadNodes; // Find acceptable loads. Loads need to have the same chain (token factor), // must not be zext, volatile, indexed, and they must be consecutive. BaseIndexOffset LdBasePtr; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *St = cast(StoreNodes[i].MemNode); SDValue Val = peekThroughBitcast(St->getValue()); LoadSDNode *Ld = dyn_cast(Val); if (!Ld) break; // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) break; // The memory operands must not be volatile. if (Ld->isVolatile() || Ld->isIndexed()) break; // The stored memory type must be the same. if (Ld->getMemoryVT() != MemVT) break; BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG); // If this is not the first ptr that we check. int64_t LdOffset = 0; if (LdBasePtr.getBase().getNode()) { // The base ptr must be the same. if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset)) break; } else { // Check that all other base pointers are the same as this one. LdBasePtr = LdPtr; } // We found a potential memory operand to merge. LoadNodes.push_back(MemOpLink(Ld, LdOffset)); } if (LoadNodes.size() < 2) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1); continue; } // If we have load/store pair instructions and we only have two values, // don't bother merging. unsigned RequiredAlignment; if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); continue; } LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); LoadSDNode *FirstLoad = cast(LoadNodes[0].MemNode); unsigned FirstLoadAS = FirstLoad->getAddressSpace(); unsigned FirstLoadAlign = FirstLoad->getAlignment(); // Scan the memory operations on the chain and find the first // non-consecutive load memory address. These variables hold the index in // the store node array. unsigned LastConsecutiveLoad = 1; // This variable refers to the size and not index in the array. unsigned LastLegalVectorType = 1; unsigned LastLegalIntegerType = 1; bool isDereferenceable = true; bool DoIntegerTruncate = false; StartAddress = LoadNodes[0].OffsetFromBase; SDValue FirstChain = FirstLoad->getChain(); for (unsigned i = 1; i < LoadNodes.size(); ++i) { // All loads must share the same chain. if (LoadNodes[i].MemNode->getChain() != FirstChain) break; int64_t CurrAddress = LoadNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; LastConsecutiveLoad = i; if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) isDereferenceable = false; // Find a legal type for the vector store. unsigned Elts = (i + 1) * NumMemElts; EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); bool IsFastSt, IsFastLd; if (TLI.isTypeLegal(StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalVectorType = i + 1; } // Find a legal type for the integer store. unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; StoreTy = EVT::getIntegerVT(Context, SizeInBits); if (TLI.isTypeLegal(StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalIntegerType = i + 1; DoIntegerTruncate = false; // Or check whether a truncstore and extload is legal. } else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) && TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalIntegerType = i + 1; DoIntegerTruncate = true; } } } // Only use vector types if the vector type is larger than the integer type. // If they are the same, use integers. bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors; unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType); // We add +1 here because the LastXXX variables refer to location while // the NumElem refers to array/index size. unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); NumElem = std::min(LastLegalType, NumElem); if (NumElem < 2) { // We know that candidate stores are in order and of correct // shape. While there is no mergeable sequence from the // beginning one may start later in the sequence. The only // reason a merge of size N could have failed where another of // the same size would not have is if the alignment or either // the load or store has improved. Drop as many candidates as we // can here. unsigned NumSkip = 1; while ((NumSkip < LoadNodes.size()) && (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) && (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); continue; } // Find if it is better to use vectors or integers to load and store // to memory. EVT JointMemOpVT; if (UseVectorTy) { // Find a legal type for the vector store. unsigned Elts = NumElem * NumMemElts; JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); } else { unsigned SizeInBits = NumElem * ElementSizeBytes * 8; JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); } SDLoc LoadDL(LoadNodes[0].MemNode); SDLoc StoreDL(StoreNodes[0].MemNode); // The merged loads are required to have the same incoming chain, so // using the first's chain is acceptable. SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); MachineMemOperand::Flags MMOFlags = isDereferenceable ? MachineMemOperand::MODereferenceable: MachineMemOperand::MONone; SDValue NewLoad, NewStore; if (UseVectorTy || !DoIntegerTruncate) { NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), FirstLoadAlign, MMOFlags); NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstStoreAlign); } else { // This must be the truncstore/extload case EVT ExtendedTy = TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), JointMemOpVT, FirstLoadAlign, MMOFlags); NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), JointMemOpVT, FirstInChain->getAlignment(), FirstInChain->getMemOperand()->getFlags()); } // Transfer chain users from old loads to the new load. for (unsigned i = 0; i < NumElem; ++i) { LoadSDNode *Ld = cast(LoadNodes[i].MemNode); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1)); } // Replace the all stores with the new store. Recursively remove // corresponding value if its no longer used. for (unsigned i = 0; i < NumElem; ++i) { SDValue Val = StoreNodes[i].MemNode->getOperand(1); CombineTo(StoreNodes[i].MemNode, NewStore); if (Val.getNode()->use_empty()) recursivelyDeleteUnusedNodes(Val.getNode()); } RV = true; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); } return RV; } SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) { SDLoc SL(ST); SDValue ReplStore; // Replace the chain to avoid dependency. if (ST->isTruncatingStore()) { ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(), ST->getMemoryVT(), ST->getMemOperand()); } else { ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(), ST->getMemOperand()); } // Create token to keep both nodes around. SDValue Token = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, ST->getChain(), ReplStore); // Make sure the new and old chains are cleaned up. AddToWorklist(Token.getNode()); // Don't add users to work list. return CombineTo(ST, Token, false); } SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { SDValue Value = ST->getValue(); if (Value.getOpcode() == ISD::TargetConstantFP) return SDValue(); SDLoc DL(ST); SDValue Chain = ST->getChain(); SDValue Ptr = ST->getBasePtr(); const ConstantFPSDNode *CFP = cast(Value); // NOTE: If the original store is volatile, this transform must not increase // the number of stores. For example, on x86-32 an f64 can be stored in one // processor operation but an i64 (which is not legal) requires two. So the // transform should not be done in this case. SDValue Tmp; switch (CFP->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unknown FP type"); case MVT::f16: // We don't do this for these yet. case MVT::f80: case MVT::f128: case MVT::ppcf128: return SDValue(); case MVT::f32: if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { ; Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). bitcastToAPInt().getZExtValue(), SDLoc(CFP), MVT::i32); return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); } return SDValue(); case MVT::f64: if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { ; Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). getZExtValue(), SDLoc(CFP), MVT::i64); return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); } if (!ST->isVolatile() && TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { // Many FP stores are not made apparent until after legalize, e.g. for // argument passing. Since this is so common, custom legalize the // 64-bit integer store into two 32-bit stores. uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), ST->getAlignment(), MMOFlags, AAInfo); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(4, DL, Ptr.getValueType())); Alignment = MinAlign(Alignment, 4U); SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), Alignment, MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1); } return SDValue(); } } SDValue DAGCombiner::visitSTORE(SDNode *N) { StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); // If this is a store of a bit convert, store the input value if the // resultant store does not need a higher alignment than the original. if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() && ST->isUnindexed()) { EVT SVT = Value.getOperand(0).getValueType(); if (((!LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) && TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) { unsigned OrigAlign = ST->getAlignment(); bool Fast = false; if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT, ST->getAddressSpace(), OrigAlign, &Fast) && Fast) { return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getPointerInfo(), OrigAlign, ST->getMemOperand()->getFlags(), ST->getAAInfo()); } } } // Turn 'store undef, Ptr' -> nothing. if (Value.isUndef() && ST->isUnindexed()) return Chain; // Try to infer better alignment information than the store already has. if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > ST->getAlignment()) { SDValue NewStore = DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(), ST->getMemoryVT(), Align, ST->getMemOperand()->getFlags(), ST->getAAInfo()); if (NewStore.getNode() != N) return CombineTo(ST, NewStore, true); } } } // Try transforming a pair floating point load / store ops to integer // load / store ops. if (SDValue NewST = TransformFPLoadStorePair(N)) return NewST; if (ST->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes, on this store and any // adjacent stores. if (findBetterNeighborChains(ST)) { // replaceStoreChain uses CombineTo, which handled all of the worklist // manipulation. Return the original node to not do anything else. return SDValue(ST, 0); } Chain = ST->getChain(); } // FIXME: is there such a thing as a truncating indexed store? if (ST->isTruncatingStore() && ST->isUnindexed() && Value.getValueType().isInteger()) { // See if we can simplify the input to this truncstore with knowledge that // only the low bits are being used. For example: // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" SDValue Shorter = DAG.GetDemandedBits( Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), ST->getMemoryVT().getScalarSizeInBits())); AddToWorklist(Value.getNode()); if (Shorter.getNode()) return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(), ST->getMemOperand()); // Otherwise, see if we can simplify the operation with // SimplifyDemandedBits, which only works if the value has a single use. if (SimplifyDemandedBits( Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), ST->getMemoryVT().getScalarSizeInBits()))) { // Re-visit the store if anything changed and the store hasn't been merged // with another node (N is deleted) SimplifyDemandedBits will add Value's // node back to the worklist if necessary, but we also need to re-visit // the Store node itself. if (N->getOpcode() != ISD::DELETED_NODE) AddToWorklist(N); return SDValue(N, 0); } } // If this is a load followed by a store to the same location, then the store // is dead/noop. if (LoadSDNode *Ld = dyn_cast(Value)) { if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() && ST->isUnindexed() && !ST->isVolatile() && // There can't be any side effects between the load and store, such as // a call or store. Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) { // The store is dead, remove it. return Chain; } } // Deal with elidable overlapping chained stores. if (StoreSDNode *ST1 = dyn_cast(Chain)) if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && ST1->isUnindexed() && !ST1->isVolatile() && ST1->hasOneUse() && !ST1->getBasePtr().isUndef() && !ST->isVolatile()) { BaseIndexOffset STBasePtr = BaseIndexOffset::match(ST->getBasePtr(), DAG); BaseIndexOffset ST1BasePtr = BaseIndexOffset::match(ST1->getBasePtr(), DAG); unsigned STBytes = ST->getMemoryVT().getStoreSize(); unsigned ST1Bytes = ST1->getMemoryVT().getStoreSize(); int64_t PtrDiff; // If this is a store who's preceeding store to a subset of the same // memory and no one other node is chained to that store we can // effectively drop the store. Do not remove stores to undef as they may // be used as data sinks. if (((ST->getBasePtr() == ST1->getBasePtr()) && (ST->getValue() == ST1->getValue())) || (STBasePtr.equalBaseIndex(ST1BasePtr, DAG, PtrDiff) && (0 <= PtrDiff) && (PtrDiff + ST1Bytes <= STBytes))) { CombineTo(ST1, ST1->getChain()); return SDValue(N, 0); } } // If this is an FP_ROUND or TRUNC followed by a store, fold this into a // truncating store. We can do this even if this is already a truncstore. if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() && ST->isUnindexed() && TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), ST->getMemoryVT())) { return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getMemoryVT(), ST->getMemOperand()); } // Always perform this optimization before types are legal. If the target // prefers, also try this after legalization to catch stores that were created // by intrinsics or other nodes. if (!LegalTypes || (TLI.mergeStoresAfterLegalization())) { while (true) { // There can be multiple store sequences on the same chain. // Keep trying to merge store sequences until we are unable to do so // or until we merge the last store on the chain. bool Changed = MergeConsecutiveStores(ST); if (!Changed) break; // Return N as merge only uses CombineTo and no worklist clean // up is necessary. if (N->getOpcode() == ISD::DELETED_NODE || !isa(N)) return SDValue(N, 0); } } // Try transforming N to an indexed store. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' // // Make sure to do this only after attempting to merge stores in order to // avoid changing the types of some subset of stores due to visit order, // preventing their merging. if (isa(ST->getValue())) { if (SDValue NewSt = replaceStoreOfFPConstant(ST)) return NewSt; } if (SDValue NewSt = splitMergedValStore(ST)) return NewSt; return ReduceLoadOpStoreWidth(N); } /// For the instruction sequence of store below, F and I values /// are bundled together as an i64 value before being stored into memory. /// Sometimes it is more efficent to generate separate stores for F and I, /// which can remove the bitwise instructions or sink them to colder places. /// /// (store (or (zext (bitcast F to i32) to i64), /// (shl (zext I to i64), 32)), addr) --> /// (store F, addr) and (store I, addr+4) /// /// Similarly, splitting for other merged store can also be beneficial, like: /// For pair of {i32, i32}, i64 store --> two i32 stores. /// For pair of {i32, i16}, i64 store --> two i32 stores. /// For pair of {i16, i16}, i32 store --> two i16 stores. /// For pair of {i16, i8}, i32 store --> two i16 stores. /// For pair of {i8, i8}, i16 store --> two i8 stores. /// /// We allow each target to determine specifically which kind of splitting is /// supported. /// /// The store patterns are commonly seen from the simple code snippet below /// if only std::make_pair(...) is sroa transformed before inlined into hoo. /// void goo(const std::pair &); /// hoo() { /// ... /// goo(std::make_pair(tmp, ftmp)); /// ... /// } /// SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { if (OptLevel == CodeGenOpt::None) return SDValue(); SDValue Val = ST->getValue(); SDLoc DL(ST); // Match OR operand. if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR) return SDValue(); // Match SHL operand and get Lower and Higher parts of Val. SDValue Op1 = Val.getOperand(0); SDValue Op2 = Val.getOperand(1); SDValue Lo, Hi; if (Op1.getOpcode() != ISD::SHL) { std::swap(Op1, Op2); if (Op1.getOpcode() != ISD::SHL) return SDValue(); } Lo = Op2; Hi = Op1.getOperand(0); if (!Op1.hasOneUse()) return SDValue(); // Match shift amount to HalfValBitSize. unsigned HalfValBitSize = Val.getValueSizeInBits() / 2; ConstantSDNode *ShAmt = dyn_cast(Op1.getOperand(1)); if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize) return SDValue(); // Lo and Hi are zero-extended from int with size less equal than 32 // to i64. if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() || !Lo.getOperand(0).getValueType().isScalarInteger() || Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize || Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() || !Hi.getOperand(0).getValueType().isScalarInteger() || Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize) return SDValue(); // Use the EVT of low and high parts before bitcast as the input // of target query. EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST) ? Lo.getOperand(0).getValueType() : Lo.getValueType(); EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST) ? Hi.getOperand(0).getValueType() : Hi.getValueType(); if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy)) return SDValue(); // Start to split store. unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); // Change the sizes of Lo and Hi's value types to HalfValBitSize. EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize); Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0)); Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0)); SDValue Chain = ST->getChain(); SDValue Ptr = ST->getBasePtr(); // Lower value store. SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), ST->getAlignment(), MMOFlags, AAInfo); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType())); // Higher value store. SDValue St1 = DAG.getStore(St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), Alignment / 2, MMOFlags, AAInfo); return St1; } /// Convert a disguised subvector insertion into a shuffle: /// insert_vector_elt V, (bitcast X from vector type), IdxC --> /// bitcast(shuffle (bitcast V), (extended X), Mask) /// Note: We do not use an insert_subvector node because that requires a legal /// subvector type. SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { SDValue InsertVal = N->getOperand(1); if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() || !InsertVal.getOperand(0).getValueType().isVector()) return SDValue(); SDValue SubVec = InsertVal.getOperand(0); SDValue DestVec = N->getOperand(0); EVT SubVecVT = SubVec.getValueType(); EVT VT = DestVec.getValueType(); unsigned NumSrcElts = SubVecVT.getVectorNumElements(); unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits(); unsigned NumMaskVals = ExtendRatio * NumSrcElts; // Step 1: Create a shuffle mask that implements this insert operation. The // vector that we are inserting into will be operand 0 of the shuffle, so // those elements are just 'i'. The inserted subvector is in the first // positions of operand 1 of the shuffle. Example: // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7} SmallVector Mask(NumMaskVals); for (unsigned i = 0; i != NumMaskVals; ++i) { if (i / NumSrcElts == InsIndex) Mask[i] = (i % NumSrcElts) + NumMaskVals; else Mask[i] = i; } // Bail out if the target can not handle the shuffle we want to create. EVT SubVecEltVT = SubVecVT.getVectorElementType(); EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals); if (!TLI.isShuffleMaskLegal(Mask, ShufVT)) return SDValue(); // Step 2: Create a wide vector from the inserted source vector by appending // undefined elements. This is the same size as our destination vector. SDLoc DL(N); SmallVector ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT)); ConcatOps[0] = SubVec; SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps); // Step 3: Shuffle in the padded subvector. SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec); SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask); AddToWorklist(PaddedSubV.getNode()); AddToWorklist(DestVecBC.getNode()); AddToWorklist(Shuf.getNode()); return DAG.getBitcast(VT, Shuf); } SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { SDValue InVec = N->getOperand(0); SDValue InVal = N->getOperand(1); SDValue EltNo = N->getOperand(2); SDLoc DL(N); // If the inserted element is an UNDEF, just use the input vector. if (InVal.isUndef()) return InVec; EVT VT = InVec.getValueType(); // Remove redundant insertions: // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1)) return InVec; // We must know which element is being inserted for folds below here. auto *IndexC = dyn_cast(EltNo); if (!IndexC) return SDValue(); unsigned Elt = IndexC->getZExtValue(); if (SDValue Shuf = combineInsertEltToShuffle(N, Elt)) return Shuf; // Canonicalize insert_vector_elt dag nodes. // Example: // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1) // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0) // // Do this only if the child insert_vector node has one use; also // do this only if indices are both constants and Idx1 < Idx0. if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() && isa(InVec.getOperand(2))) { unsigned OtherElt = InVec.getConstantOperandVal(2); if (Elt < OtherElt) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, InVec.getOperand(0), InVal, EltNo); AddToWorklist(NewOp.getNode()); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()), VT, NewOp, InVec.getOperand(1), InVec.getOperand(2)); } } // If we can't generate a legal BUILD_VECTOR, exit if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) return SDValue(); // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially // be converted to a BUILD_VECTOR). Fill in the Ops vector with the // vector elements. SmallVector Ops; // Do not combine these two vectors if the output vector will not replace // the input vector. if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) { Ops.append(InVec.getNode()->op_begin(), InVec.getNode()->op_end()); } else if (InVec.isUndef()) { unsigned NElts = VT.getVectorNumElements(); Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); } else { return SDValue(); } // Insert the element if (Elt < Ops.size()) { // All the operands of BUILD_VECTOR must have the same type; // we enforce that here. EVT OpVT = Ops[0].getValueType(); Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal; } // Return the new vector return DAG.getBuildVector(VT, DL, Ops); } SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) { assert(!OriginalLoad->isVolatile()); EVT ResultVT = EVE->getValueType(0); EVT VecEltVT = InVecVT.getVectorElementType(); unsigned Align = OriginalLoad->getAlignment(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( VecEltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) return SDValue(); ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD; if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) return SDValue(); Align = NewAlign; SDValue NewPtr = OriginalLoad->getBasePtr(); SDValue Offset; EVT PtrType = NewPtr.getValueType(); MachinePointerInfo MPI; SDLoc DL(EVE); if (auto *ConstEltNo = dyn_cast(EltNo)) { int Elt = ConstEltNo->getZExtValue(); unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; Offset = DAG.getConstant(PtrOff, DL, PtrType); MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); } else { Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType); Offset = DAG.getNode( ISD::MUL, DL, PtrType, Offset, DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType)); MPI = OriginalLoad->getPointerInfo(); } NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset); // The replacement we need to do here is a little tricky: we need to // replace an extractelement of a load with a load. // Use ReplaceAllUsesOfValuesWith to do the replacement. // Note that this replacement assumes that the extractvalue is the only // use of the load; that's okay because we don't want to perform this // transformation in other cases anyway. SDValue Load; SDValue Chain; if (ResultVT.bitsGT(VecEltVT)) { // If the result type of vextract is wider than the load, then issue an // extending load instead. ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD; Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(), NewPtr, MPI, VecEltVT, Align, OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo()); Chain = Load.getValue(1); } else { Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Align, OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo()); Chain = Load.getValue(1); if (ResultVT.bitsLT(VecEltVT)) Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); else Load = DAG.getBitcast(ResultVT, Load); } WorklistRemover DeadNodes(*this); SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; SDValue To[] = { Load, Chain }; DAG.ReplaceAllUsesOfValuesWith(From, To, 2); // Since we're explicitly calling ReplaceAllUses, add the new node to the // worklist explicitly as well. AddToWorklist(Load.getNode()); AddUsersToWorklist(Load.getNode()); // Add users too // Make sure to revisit this node to clean it up; it will usually be dead. AddToWorklist(EVE); ++OpsNarrowed; return SDValue(EVE, 0); } SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // (vextract (scalar_to_vector val, 0) -> val SDValue InVec = N->getOperand(0); EVT VT = InVec.getValueType(); EVT NVT = N->getValueType(0); if (InVec.isUndef()) return DAG.getUNDEF(NVT); if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. SDValue InOp = InVec.getOperand(0); if (InOp.getValueType() != NVT) { assert(InOp.getValueType().isInteger() && NVT.isInteger()); return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT); } return InOp; } SDValue EltNo = N->getOperand(1); ConstantSDNode *ConstEltNo = dyn_cast(EltNo); // extract_vector_elt (build_vector x, y), 1 -> y if (ConstEltNo && InVec.getOpcode() == ISD::BUILD_VECTOR && TLI.isTypeLegal(VT) && (InVec.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VT))) { SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue()); EVT InEltVT = Elt.getValueType(); // Sometimes build_vector's scalar input types do not match result type. if (NVT == InEltVT) return Elt; // TODO: It may be useful to truncate if free if the build_vector implicitly // converts. } // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x) bool isLE = DAG.getDataLayout().isLittleEndian(); unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1; if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() && ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) { SDValue BCSrc = InVec.getOperand(0); if (BCSrc.getValueType().isScalarInteger()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc); } // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val // // This only really matters if the index is non-constant since other combines // on the constant elements already work. if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && EltNo == InVec.getOperand(2)) { SDValue Elt = InVec.getOperand(1); return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt; } // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT. // We only perform this optimization before the op legalization phase because // we may introduce new vector instructions which are not backed by TD // patterns. For example on AVX, extracting elements from a wide vector // without using extract_subvector. However, if we can find an underlying // scalar value, then we can always use that. if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) { int NumElem = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(InVec); // Find the new index to extract from. int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue()); // Extracting an undef index is undef. if (OrigElt == -1) return DAG.getUNDEF(NVT); // Select the right vector half to extract from. SDValue SVInVec; if (OrigElt < NumElem) { SVInVec = InVec->getOperand(0); } else { SVInVec = InVec->getOperand(1); OrigElt -= NumElem; } if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) { SDValue InOp = SVInVec.getOperand(OrigElt); if (InOp.getValueType() != NVT) { assert(InOp.getValueType().isInteger() && NVT.isInteger()); InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT); } return InOp; } // FIXME: We should handle recursing on other vector shuffles and // scalar_to_vector here as well. if (!LegalOperations || // FIXME: Should really be just isOperationLegalOrCustom. TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) || TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) { EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout()); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec, DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy)); } } bool BCNumEltsChanged = false; EVT ExtVT = VT.getVectorElementType(); EVT LVT = ExtVT; // If the result of load has to be truncated, then it's not necessarily // profitable. if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) return SDValue(); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) return SDValue(); if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) BCNumEltsChanged = true; InVec = InVec.getOperand(0); ExtVT = BCVT.getVectorElementType(); } // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size) if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() && ISD::isNormalLoad(InVec.getNode()) && !N->getOperand(1)->hasPredecessor(InVec.getNode())) { SDValue Index = N->getOperand(1); if (LoadSDNode *OrigLoad = dyn_cast(InVec)) { if (!OrigLoad->isVolatile()) { return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index, OrigLoad); } } } // Perform only after legalization to ensure build_vector / vector_shuffle // optimizations have already been done. if (!LegalOperations) return SDValue(); // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size) // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size) // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr) if (ConstEltNo) { int Elt = cast(EltNo)->getZExtValue(); LoadSDNode *LN0 = nullptr; const ShuffleVectorSDNode *SVN = nullptr; if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast(InVec); } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && InVec.getOperand(0).getValueType() == ExtVT && ISD::isNormalLoad(InVec.getOperand(0).getNode())) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); LN0 = cast(InVec.getOperand(0)); } else if ((SVN = dyn_cast(InVec))) { // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1) // => // (load $addr+1*size) // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); // If the bit convert changed the number of elements, it is unsafe // to examine the mask. if (BCNumEltsChanged) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = VT.getVectorNumElements(); int Idx = (Elt > (int)NumElems) ? -1 : SVN->getMaskElt(Elt); InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); InVec = InVec.getOperand(0); } if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast(InVec); Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems; EltNo = DAG.getConstant(Elt, SDLoc(EltNo), EltNo.getValueType()); } } // Make sure we found a non-volatile load and the extractelement is // the only use. if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) return SDValue(); // If Idx was -1 above, Elt is going to be -1, so just return undef. if (Elt == -1) return DAG.getUNDEF(LVT); return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0); } return SDValue(); } // Simplify (build_vec (ext )) to (bitcast (build_vec )) SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { // We perform this optimization post type-legalization because // the type-legalizer often scalarizes integer-promoted vectors. // Performing this optimization before may create bit-casts which // will be type-legalized to complex code sequences. // We perform this optimization only before the operation legalizer because we // may introduce illegal operations. if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes) return SDValue(); unsigned NumInScalars = N->getNumOperands(); SDLoc DL(N); EVT VT = N->getValueType(0); // Check to see if this is a BUILD_VECTOR of a bunch of values // which come from any_extend or zero_extend nodes. If so, we can create // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR // optimizations. We do not handle sign-extend because we can't fill the sign // using shuffles. EVT SourceType = MVT::Other; bool AllAnyExt = true; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); // Ignore undef inputs. if (In.isUndef()) continue; bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND; bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND; // Abort if the element is not an extension. if (!ZeroExt && !AnyExt) { SourceType = MVT::Other; break; } // The input is a ZeroExt or AnyExt. Check the original type. EVT InTy = In.getOperand(0).getValueType(); // Check that all of the widened source types are the same. if (SourceType == MVT::Other) // First time. SourceType = InTy; else if (InTy != SourceType) { // Multiple income types. Abort. SourceType = MVT::Other; break; } // Check if all of the extends are ANY_EXTENDs. AllAnyExt &= AnyExt; } // In order to have valid types, all of the inputs must be extended from the // same source type and all of the inputs must be any or zero extend. // Scalar sizes must be a power of two. EVT OutScalarTy = VT.getScalarType(); bool ValidTypes = SourceType != MVT::Other && isPowerOf2_32(OutScalarTy.getSizeInBits()) && isPowerOf2_32(SourceType.getSizeInBits()); // Create a new simpler BUILD_VECTOR sequence which other optimizations can // turn into a single shuffle instruction. if (!ValidTypes) return SDValue(); bool isLE = DAG.getDataLayout().isLittleEndian(); unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits(); assert(ElemRatio > 1 && "Invalid element size ratio"); SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType): DAG.getConstant(0, DL, SourceType); unsigned NewBVElems = ElemRatio * VT.getVectorNumElements(); SmallVector Ops(NewBVElems, Filler); // Populate the new build_vector for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Cast = N->getOperand(i); assert((Cast.getOpcode() == ISD::ANY_EXTEND || Cast.getOpcode() == ISD::ZERO_EXTEND || Cast.isUndef()) && "Invalid cast opcode"); SDValue In; if (Cast.isUndef()) In = DAG.getUNDEF(SourceType); else In = Cast->getOperand(0); unsigned Index = isLE ? (i * ElemRatio) : (i * ElemRatio + (ElemRatio - 1)); assert(Index < Ops.size() && "Invalid index"); Ops[Index] = In; } // The type of the new BUILD_VECTOR node. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems); assert(VecVT.getSizeInBits() == VT.getSizeInBits() && "Invalid vector size"); // Check if the new vector type is legal. if (!isTypeLegal(VecVT)) return SDValue(); // Make the new BUILD_VECTOR. SDValue BV = DAG.getBuildVector(VecVT, DL, Ops); // The new BUILD_VECTOR node has the potential to be further optimized. AddToWorklist(BV.getNode()); // Bitcast to the desired type. return DAG.getBitcast(VT, BV); } SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumInScalars = N->getNumOperands(); SDLoc DL(N); EVT SrcVT = MVT::Other; unsigned Opcode = ISD::DELETED_NODE; unsigned NumDefs = 0; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); unsigned Opc = In.getOpcode(); if (Opc == ISD::UNDEF) continue; // If all scalar values are floats and converted from integers. if (Opcode == ISD::DELETED_NODE && (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) { Opcode = Opc; } if (Opc != Opcode) return SDValue(); EVT InVT = In.getOperand(0).getValueType(); // If all scalar values are typed differently, bail out. It's chosen to // simplify BUILD_VECTOR of integer types. if (SrcVT == MVT::Other) SrcVT = InVT; if (SrcVT != InVT) return SDValue(); NumDefs++; } // If the vector has just one element defined, it's not worth to fold it into // a vectorized one. if (NumDefs < 2) return SDValue(); assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP) && "Should only handle conversion from integer to float."); assert(SrcVT != MVT::Other && "Cannot determine source type!"); EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars); if (!TLI.isOperationLegalOrCustom(Opcode, NVT)) return SDValue(); // Just because the floating-point vector type is legal does not necessarily // mean that the corresponding integer vector type is. if (!isTypeLegal(NVT)) return SDValue(); SmallVector Opnds; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); if (In.isUndef()) Opnds.push_back(DAG.getUNDEF(SrcVT)); else Opnds.push_back(In.getOperand(0)); } SDValue BV = DAG.getBuildVector(NVT, DL, Opnds); AddToWorklist(BV.getNode()); return DAG.getNode(Opcode, DL, VT, BV); } SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef VectorMask, SDValue VecIn1, SDValue VecIn2, unsigned LeftIdx) { MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy); EVT VT = N->getValueType(0); EVT InVT1 = VecIn1.getValueType(); EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1; unsigned Vec2Offset = 0; unsigned NumElems = VT.getVectorNumElements(); unsigned ShuffleNumElems = NumElems; // In case both the input vectors are extracted from same base // vector we do not need extra addend (Vec2Offset) while // computing shuffle mask. if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) || !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) || !(VecIn1.getOperand(0) == VecIn2.getOperand(0))) Vec2Offset = InVT1.getVectorNumElements(); // We can't generate a shuffle node with mismatched input and output types. // Try to make the types match the type of the output. if (InVT1 != VT || InVT2 != VT) { if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) { // If the output vector length is a multiple of both input lengths, // we can concatenate them and pad the rest with undefs. unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits(); assert(NumConcats >= 2 && "Concat needs at least two inputs!"); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(InVT1)); ConcatOps[0] = VecIn1; ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1); VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); VecIn2 = SDValue(); } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) { if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems)) return SDValue(); if (!VecIn2.getNode()) { // If we only have one input vector, and it's twice the size of the // output, split it in two. VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, DAG.getConstant(NumElems, DL, IdxTy)); VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx); // Since we now have shorter input vectors, adjust the offset of the // second vector's start. Vec2Offset = NumElems; } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) { // VecIn1 is wider than the output, and we have another, possibly // smaller input. Pad the smaller input with undefs, shuffle at the // input vector width, and extract the output. // The shuffle type is different than VT, so check legality again. if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1)) return SDValue(); // Legalizing INSERT_SUBVECTOR is tricky - you basically have to // lower it back into a BUILD_VECTOR. So if the inserted type is // illegal, don't even try. if (InVT1 != InVT2) { if (!TLI.isTypeLegal(InVT2)) return SDValue(); VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1, DAG.getUNDEF(InVT1), VecIn2, ZeroIdx); } ShuffleNumElems = NumElems * 2; } else { // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider // than VecIn1. We can't handle this for now - this case will disappear // when we start sorting the vectors by type. return SDValue(); } } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && InVT1.getSizeInBits() == VT.getSizeInBits()) { SmallVector ConcatOps(2, DAG.getUNDEF(InVT2)); ConcatOps[0] = VecIn2; VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); } else { // TODO: Support cases where the length mismatch isn't exactly by a // factor of 2. // TODO: Move this check upwards, so that if we have bad type // mismatches, we don't create any DAG nodes. return SDValue(); } } // Initialize mask to undef. SmallVector Mask(ShuffleNumElems, -1); // Only need to run up to the number of elements actually used, not the // total number of elements in the shuffle - if we are shuffling a wider // vector, the high lanes should be set to undef. for (unsigned i = 0; i != NumElems; ++i) { if (VectorMask[i] <= 0) continue; unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1); if (VectorMask[i] == (int)LeftIdx) { Mask[i] = ExtIndex; } else if (VectorMask[i] == (int)LeftIdx + 1) { Mask[i] = Vec2Offset + ExtIndex; } } // The type the input vectors may have changed above. InVT1 = VecIn1.getValueType(); // If we already have a VecIn2, it should have the same type as VecIn1. // If we don't, get an undef/zero vector of the appropriate type. VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1); assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type."); SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask); if (ShuffleNumElems > NumElems) Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx); return Shuffle; } // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT // operations. If the types of the vectors we're extracting from allow it, // turn this into a vector_shuffle node. SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { SDLoc DL(N); EVT VT = N->getValueType(0); // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes. if (!isTypeLegal(VT)) return SDValue(); // May only combine to shuffle after legalize if shuffle is legal. if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT)) return SDValue(); bool UsesZeroVector = false; unsigned NumElems = N->getNumOperands(); // Record, for each element of the newly built vector, which input vector // that element comes from. -1 stands for undef, 0 for the zero vector, // and positive values for the input vectors. // VectorMask maps each element to its vector number, and VecIn maps vector // numbers to their initial SDValues. SmallVector VectorMask(NumElems, -1); SmallVector VecIn; VecIn.push_back(SDValue()); for (unsigned i = 0; i != NumElems; ++i) { SDValue Op = N->getOperand(i); if (Op.isUndef()) continue; // See if we can use a blend with a zero vector. // TODO: Should we generalize this to a blend with an arbitrary constant // vector? if (isNullConstant(Op) || isNullFPConstant(Op)) { UsesZeroVector = true; VectorMask[i] = 0; continue; } // Not an undef or zero. If the input is something other than an // EXTRACT_VECTOR_ELT with a constant index, bail out. if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op.getOperand(1))) return SDValue(); SDValue ExtractedFromVec = Op.getOperand(0); // All inputs must have the same element type as the output. if (VT.getVectorElementType() != ExtractedFromVec.getValueType().getVectorElementType()) return SDValue(); // Have we seen this input vector before? // The vectors are expected to be tiny (usually 1 or 2 elements), so using // a map back from SDValues to numbers isn't worth it. unsigned Idx = std::distance( VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec)); if (Idx == VecIn.size()) VecIn.push_back(ExtractedFromVec); VectorMask[i] = Idx; } // If we didn't find at least one input vector, bail out. if (VecIn.size() < 2) return SDValue(); // If all the Operands of BUILD_VECTOR extract from same // vector, then split the vector efficiently based on the maximum // vector access index and adjust the VectorMask and // VecIn accordingly. if (VecIn.size() == 2) { unsigned MaxIndex = 0; unsigned NearestPow2 = 0; SDValue Vec = VecIn.back(); EVT InVT = Vec.getValueType(); MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SmallVector IndexVec(NumElems, 0); for (unsigned i = 0; i < NumElems; i++) { if (VectorMask[i] <= 0) continue; unsigned Index = N->getOperand(i).getConstantOperandVal(1); IndexVec[i] = Index; MaxIndex = std::max(MaxIndex, Index); } NearestPow2 = PowerOf2Ceil(MaxIndex); if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 && NumElems * 2 < NearestPow2) { unsigned SplitSize = NearestPow2 / 2; EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), SplitSize); if (TLI.isTypeLegal(SplitVT)) { SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, DAG.getConstant(SplitSize, DL, IdxTy)); SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, DAG.getConstant(0, DL, IdxTy)); VecIn.pop_back(); VecIn.push_back(VecIn1); VecIn.push_back(VecIn2); for (unsigned i = 0; i < NumElems; i++) { if (VectorMask[i] <= 0) continue; VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2; } } } } // TODO: We want to sort the vectors by descending length, so that adjacent // pairs have similar length, and the longer vector is always first in the // pair. // TODO: Should this fire if some of the input vectors has illegal type (like // it does now), or should we let legalization run its course first? // Shuffle phase: // Take pairs of vectors, and shuffle them so that the result has elements // from these vectors in the correct places. // For example, given: // t10: i32 = extract_vector_elt t1, Constant:i64<0> // t11: i32 = extract_vector_elt t2, Constant:i64<0> // t12: i32 = extract_vector_elt t3, Constant:i64<0> // t13: i32 = extract_vector_elt t1, Constant:i64<1> // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13 // We will generate: // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2 // t21: v4i32 = vector_shuffle t3, undef SmallVector Shuffles; for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) { unsigned LeftIdx = 2 * In + 1; SDValue VecLeft = VecIn[LeftIdx]; SDValue VecRight = (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue(); if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft, VecRight, LeftIdx)) Shuffles.push_back(Shuffle); else return SDValue(); } // If we need the zero vector as an "ingredient" in the blend tree, add it // to the list of shuffles. if (UsesZeroVector) Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT)); // If we only have one shuffle, we're done. if (Shuffles.size() == 1) return Shuffles[0]; // Update the vector mask to point to the post-shuffle vectors. for (int &Vec : VectorMask) if (Vec == 0) Vec = Shuffles.size() - 1; else Vec = (Vec - 1) / 2; // More than one shuffle. Generate a binary tree of blends, e.g. if from // the previous step we got the set of shuffles t10, t11, t12, t13, we will // generate: // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2 // t11: v8i32 = vector_shuffle t3, t4 // t12: v8i32 = vector_shuffle t5, t6 // t13: v8i32 = vector_shuffle t7, t8 // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11 // t21: v8i32 = vector_shuffle t12, t13 // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21 // Make sure the initial size of the shuffle list is even. if (Shuffles.size() % 2) Shuffles.push_back(DAG.getUNDEF(VT)); for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) { if (CurSize % 2) { Shuffles[CurSize] = DAG.getUNDEF(VT); CurSize++; } for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) { int Left = 2 * In; int Right = 2 * In + 1; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { if (VectorMask[i] == Left) { Mask[i] = i; VectorMask[i] = In; } else if (VectorMask[i] == Right) { Mask[i] = i + NumElems; VectorMask[i] = In; } } Shuffles[In] = DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask); } } return Shuffles[0]; } SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { EVT VT = N->getValueType(0); // A vector built entirely of undefs is undef. if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); // Check if we can express BUILD VECTOR via subvector extract. if (!LegalTypes && (N->getNumOperands() > 1)) { SDValue Op0 = N->getOperand(0); auto checkElem = [&](SDValue Op) -> uint64_t { if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) && (Op0.getOperand(0) == Op.getOperand(0))) if (auto CNode = dyn_cast(Op.getOperand(1))) return CNode->getZExtValue(); return -1; }; int Offset = checkElem(Op0); for (unsigned i = 0; i < N->getNumOperands(); ++i) { if (Offset + i != checkElem(N->getOperand(i))) { Offset = -1; break; } } if ((Offset == 0) && (Op0.getOperand(0).getValueType() == N->getValueType(0))) return Op0.getOperand(0); if ((Offset != -1) && ((Offset % N->getValueType(0).getVectorNumElements()) == 0)) // IDX must be multiple of output size. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), Op0.getOperand(0), Op0.getOperand(1)); } if (SDValue V = reduceBuildVecExtToExtBuildVec(N)) return V; if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N)) return V; if (SDValue V = reduceBuildVecToShuffle(N)) return V; return SDValue(); } static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT OpVT = N->getOperand(0).getValueType(); // If the operands are legal vectors, leave them alone. if (TLI.isTypeLegal(OpVT)) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); SmallVector Ops; EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits()); SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); // Keep track of what we encounter. bool AnyInteger = false; bool AnyFP = false; for (const SDValue &Op : N->ops()) { if (ISD::BITCAST == Op.getOpcode() && !Op.getOperand(0).getValueType().isVector()) Ops.push_back(Op.getOperand(0)); else if (ISD::UNDEF == Op.getOpcode()) Ops.push_back(ScalarUndef); else return SDValue(); // Note whether we encounter an integer or floating point scalar. // If it's neither, bail out, it could be something weird like x86mmx. EVT LastOpVT = Ops.back().getValueType(); if (LastOpVT.isFloatingPoint()) AnyFP = true; else if (LastOpVT.isInteger()) AnyInteger = true; else return SDValue(); } // If any of the operands is a floating point scalar bitcast to a vector, // use floating point types throughout, and bitcast everything. // Replace UNDEFs by another scalar UNDEF node, of the final desired type. if (AnyFP) { SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits()); ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); if (AnyInteger) { for (SDValue &Op : Ops) { if (Op.getValueType() == SVT) continue; if (Op.isUndef()) Op = ScalarUndef; else Op = DAG.getBitcast(SVT, Op); } } } EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT, VT.getSizeInBits() / SVT.getSizeInBits()); return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops)); } // Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR // operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at // most two distinct vectors the same size as the result, attempt to turn this // into a legal shuffle. static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); EVT OpVT = N->getOperand(0).getValueType(); int NumElts = VT.getVectorNumElements(); int NumOpElts = OpVT.getVectorNumElements(); SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT); SmallVector Mask; for (SDValue Op : N->ops()) { // Peek through any bitcast. Op = peekThroughBitcast(Op); // UNDEF nodes convert to UNDEF shuffle mask values. if (Op.isUndef()) { Mask.append((unsigned)NumOpElts, -1); continue; } if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) return SDValue(); // What vector are we extracting the subvector from and at what index? SDValue ExtVec = Op.getOperand(0); // We want the EVT of the original extraction to correctly scale the // extraction index. EVT ExtVT = ExtVec.getValueType(); // Peek through any bitcast. ExtVec = peekThroughBitcast(ExtVec); // UNDEF nodes convert to UNDEF shuffle mask values. if (ExtVec.isUndef()) { Mask.append((unsigned)NumOpElts, -1); continue; } if (!isa(Op.getOperand(1))) return SDValue(); int ExtIdx = Op.getConstantOperandVal(1); // Ensure that we are extracting a subvector from a vector the same // size as the result. if (ExtVT.getSizeInBits() != VT.getSizeInBits()) return SDValue(); // Scale the subvector index to account for any bitcast. int NumExtElts = ExtVT.getVectorNumElements(); if (0 == (NumExtElts % NumElts)) ExtIdx /= (NumExtElts / NumElts); else if (0 == (NumElts % NumExtElts)) ExtIdx *= (NumElts / NumExtElts); else return SDValue(); // At most we can reference 2 inputs in the final shuffle. if (SV0.isUndef() || SV0 == ExtVec) { SV0 = ExtVec; for (int i = 0; i != NumOpElts; ++i) Mask.push_back(i + ExtIdx); } else if (SV1.isUndef() || SV1 == ExtVec) { SV1 = ExtVec; for (int i = 0; i != NumOpElts; ++i) Mask.push_back(i + ExtIdx + NumElts); } else { return SDValue(); } } if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT)) return SDValue(); return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0), DAG.getBitcast(VT, SV1), Mask); } SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // If we only have one input vector, we don't need to do any concatenation. if (N->getNumOperands() == 1) return N->getOperand(0); // Check if all of the operands are undefs. EVT VT = N->getValueType(0); if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); // Optimize concat_vectors where all but the first of the vectors are undef. if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) { return Op.isUndef(); })) { SDValue In = N->getOperand(0); assert(In.getValueType().isVector() && "Must concat vectors"); // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr). if (In->getOpcode() == ISD::BITCAST && !In->getOperand(0).getValueType().isVector()) { SDValue Scalar = In->getOperand(0); // If the bitcast type isn't legal, it might be a trunc of a legal type; // look through the trunc so we can still do the transform: // concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar) if (Scalar->getOpcode() == ISD::TRUNCATE && !TLI.isTypeLegal(Scalar.getValueType()) && TLI.isTypeLegal(Scalar->getOperand(0).getValueType())) Scalar = Scalar->getOperand(0); EVT SclTy = Scalar->getValueType(0); if (!SclTy.isFloatingPoint() && !SclTy.isInteger()) return SDValue(); unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits(); if (VNTNumElms < 2) return SDValue(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms); if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType())) return SDValue(); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar); return DAG.getBitcast(VT, Res); } } // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR. // We have already tested above for an UNDEF only concatenation. // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) // -> (BUILD_VECTOR A, B, ..., C, D, ...) auto IsBuildVectorOrUndef = [](const SDValue &Op) { return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode(); }; if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) { SmallVector Opnds; EVT SVT = VT.getScalarType(); EVT MinVT = SVT; if (!SVT.isFloatingPoint()) { // If BUILD_VECTOR are from built from integer, they may have different // operand types. Get the smallest type and truncate all operands to it. bool FoundMinVT = false; for (const SDValue &Op : N->ops()) if (ISD::BUILD_VECTOR == Op.getOpcode()) { EVT OpSVT = Op.getOperand(0).getValueType(); MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT; FoundMinVT = true; } assert(FoundMinVT && "Concat vector type mismatch"); } for (const SDValue &Op : N->ops()) { EVT OpVT = Op.getValueType(); unsigned NumElts = OpVT.getVectorNumElements(); if (ISD::UNDEF == Op.getOpcode()) Opnds.append(NumElts, DAG.getUNDEF(MinVT)); if (ISD::BUILD_VECTOR == Op.getOpcode()) { if (SVT.isFloatingPoint()) { assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch"); Opnds.append(Op->op_begin(), Op->op_begin() + NumElts); } else { for (unsigned i = 0; i != NumElts; ++i) Opnds.push_back( DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i))); } } } assert(VT.getVectorNumElements() == Opnds.size() && "Concat vector type mismatch"); return DAG.getBuildVector(VT, SDLoc(N), Opnds); } // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR. if (SDValue V = combineConcatVectorOfScalars(N, DAG)) return V; // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE. if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) if (SDValue V = combineConcatVectorOfExtracts(N, DAG)) return V; // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR // nodes often generate nop CONCAT_VECTOR nodes. // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that // place the incoming vectors at the exact same location. SDValue SingleSource = SDValue(); unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements(); for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Op = N->getOperand(i); if (Op.isUndef()) continue; // Check if this is the identity extract: if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) return SDValue(); // Find the single incoming vector for the extract_subvector. if (SingleSource.getNode()) { if (Op.getOperand(0) != SingleSource) return SDValue(); } else { SingleSource = Op.getOperand(0); // Check the source type is the same as the type of the result. // If not, this concat may extend the vector, so we can not // optimize it away. if (SingleSource.getValueType() != N->getValueType(0)) return SDValue(); } unsigned IdentityIndex = i * PartNumElem; ConstantSDNode *CS = dyn_cast(Op.getOperand(1)); // The extract index must be constant. if (!CS) return SDValue(); // Check that we are reading from the identity index. if (CS->getZExtValue() != IdentityIndex) return SDValue(); } if (SingleSource.getNode()) return SingleSource; return SDValue(); } /// If we are extracting a subvector produced by a wide binary operator with at /// at least one operand that was the result of a vector concatenation, then try /// to use the narrow vector operands directly to avoid the concatenation and /// extraction. static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share // some of these bailouts with other transforms. // The extract index must be a constant, so we can map it to a concat operand. auto *ExtractIndex = dyn_cast(Extract->getOperand(1)); if (!ExtractIndex) return SDValue(); // Only handle the case where we are doubling and then halving. A larger ratio // may require more than two narrow binops to replace the wide binop. EVT VT = Extract->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); assert((ExtractIndex->getZExtValue() % NumElems) == 0 && "Extract index is not a multiple of the vector length."); if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2) return SDValue(); // We are looking for an optionally bitcasted wide vector binary operator // feeding an extract subvector. SDValue BinOp = peekThroughBitcast(Extract->getOperand(0)); // TODO: The motivating case for this transform is an x86 AVX1 target. That // target has temptingly almost legal versions of bitwise logic ops in 256-bit // flavors, but no other 256-bit integer support. This could be extended to // handle any binop, but that may require fixing/adding other folds to avoid // codegen regressions. unsigned BOpcode = BinOp.getOpcode(); if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR) return SDValue(); // The binop must be a vector type, so we can chop it in half. EVT WideBVT = BinOp.getValueType(); if (!WideBVT.isVector()) return SDValue(); // Bail out if the target does not support a narrower version of the binop. EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), WideBVT.getVectorNumElements() / 2); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) return SDValue(); // Peek through bitcasts of the binary operator operands if needed. SDValue LHS = peekThroughBitcast(BinOp.getOperand(0)); SDValue RHS = peekThroughBitcast(BinOp.getOperand(1)); // We need at least one concatenation operation of a binop operand to make // this transform worthwhile. The concat must double the input vector sizes. // TODO: Should we also handle INSERT_SUBVECTOR patterns? bool ConcatL = LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2; bool ConcatR = RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2; if (!ConcatL && !ConcatR) return SDValue(); // If one of the binop operands was not the result of a concat, we must // extract a half-sized operand for our new narrow binop. We can't just reuse // the original extract index operand because we may have bitcasted. unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems; unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); EVT ExtBOIdxVT = Extract->getOperand(1).getValueType(); SDLoc DL(Extract); // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N) // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum)) : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, BinOp.getOperand(0), DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum)) : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, BinOp.getOperand(1), DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); return DAG.getBitcast(VT, NarrowBinOp); } /// If we are extracting a subvector from a wide vector load, convert to a /// narrow load to eliminate the extraction: /// (extract_subvector (load wide vector)) --> (load narrow vector) static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { // TODO: Add support for big-endian. The offset calculation must be adjusted. if (DAG.getDataLayout().isBigEndian()) return SDValue(); // TODO: The one-use check is overly conservative. Check the cost of the // extract instead or remove that condition entirely. auto *Ld = dyn_cast(Extract->getOperand(0)); auto *ExtIdx = dyn_cast(Extract->getOperand(1)); if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx) return SDValue(); // The narrow load will be offset from the base address of the old load if // we are extracting from something besides index 0 (little-endian). EVT VT = Extract->getValueType(0); SDLoc DL(Extract); SDValue BaseAddr = Ld->getOperand(1); unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); // TODO: Use "BaseIndexOffset" to make this more effective. SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, VT.getStoreSize()); SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); DAG.makeEquivalentMemoryOrdering(Ld, NewLd); return NewLd; } SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); // Extract from UNDEF is UNDEF. if (V.isUndef()) return DAG.getUNDEF(NVT); if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) return NarrowLoad; // Combine: // (extract_subvec (concat V1, V2, ...), i) // Into: // Vi if possible // Only operand 0 is checked as 'concat' assumes all inputs of the same // type. if (V->getOpcode() == ISD::CONCAT_VECTORS && isa(N->getOperand(1)) && V->getOperand(0).getValueType() == NVT) { unsigned Idx = N->getConstantOperandVal(1); unsigned NumElems = NVT.getVectorNumElements(); assert((Idx % NumElems) == 0 && "IDX in concat is not a multiple of the result vector length."); return V->getOperand(Idx / NumElems); } // Skip bitcasting V = peekThroughBitcast(V); // If the input is a build vector. Try to make a smaller build vector. if (V->getOpcode() == ISD::BUILD_VECTOR) { if (auto *Idx = dyn_cast(N->getOperand(1))) { EVT InVT = V->getValueType(0); unsigned ExtractSize = NVT.getSizeInBits(); unsigned EltSize = InVT.getScalarSizeInBits(); // Only do this if we won't split any elements. if (ExtractSize % EltSize == 0) { unsigned NumElems = ExtractSize / EltSize; EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), NumElems); if ((!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT)) && (!LegalTypes || TLI.isTypeLegal(ExtractVT))) { unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) / EltSize; // Extract the pieces from the original build_vector. SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N), makeArrayRef(V->op_begin() + IdxVal, NumElems)); return DAG.getBitcast(NVT, BuildVec); } } } } if (V->getOpcode() == ISD::INSERT_SUBVECTOR) { // Handle only simple case where vector being inserted and vector // being extracted are of same size. EVT SmallVT = V->getOperand(1).getValueType(); if (!NVT.bitsEq(SmallVT)) return SDValue(); // Only handle cases where both indexes are constants. ConstantSDNode *ExtIdx = dyn_cast(N->getOperand(1)); ConstantSDNode *InsIdx = dyn_cast(V->getOperand(2)); if (InsIdx && ExtIdx) { // Combine: // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) // Into: // indices are equal or bit offsets are equal => V1 // otherwise => (extract_subvec V1, ExtIdx) if (InsIdx->getZExtValue() * SmallVT.getScalarSizeInBits() == ExtIdx->getZExtValue() * NVT.getScalarSizeInBits()) return DAG.getBitcast(NVT, V->getOperand(1)); return DAG.getNode( ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, DAG.getBitcast(N->getOperand(0).getValueType(), V->getOperand(0)), N->getOperand(1)); } } if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) return NarrowBOp; return SDValue(); } static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements, SDValue V, SelectionDAG &DAG) { SDLoc DL(V); EVT VT = V.getValueType(); switch (V.getOpcode()) { default: return V; case ISD::CONCAT_VECTORS: { EVT OpVT = V->getOperand(0).getValueType(); int OpSize = OpVT.getVectorNumElements(); SmallBitVector OpUsedElements(OpSize, false); bool FoundSimplification = false; SmallVector NewOps; NewOps.reserve(V->getNumOperands()); for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) { SDValue Op = V->getOperand(i); bool OpUsed = false; for (int j = 0; j < OpSize; ++j) if (UsedElements[i * OpSize + j]) { OpUsedElements[j] = true; OpUsed = true; } NewOps.push_back( OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG) : DAG.getUNDEF(OpVT)); FoundSimplification |= Op == NewOps.back(); OpUsedElements.reset(); } if (FoundSimplification) V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps); return V; } case ISD::INSERT_SUBVECTOR: { SDValue BaseV = V->getOperand(0); SDValue SubV = V->getOperand(1); auto *IdxN = dyn_cast(V->getOperand(2)); if (!IdxN) return V; int SubSize = SubV.getValueType().getVectorNumElements(); int Idx = IdxN->getZExtValue(); bool SubVectorUsed = false; SmallBitVector SubUsedElements(SubSize, false); for (int i = 0; i < SubSize; ++i) if (UsedElements[i + Idx]) { SubVectorUsed = true; SubUsedElements[i] = true; UsedElements[i + Idx] = false; } // Now recurse on both the base and sub vectors. SDValue SimplifiedSubV = SubVectorUsed ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG) : DAG.getUNDEF(SubV.getValueType()); SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG); if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV) V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, SimplifiedBaseV, SimplifiedSubV, V->getOperand(2)); return V; } } } static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0, SDValue N1, SelectionDAG &DAG) { EVT VT = SVN->getValueType(0); int NumElts = VT.getVectorNumElements(); SmallBitVector N0UsedElements(NumElts, false), N1UsedElements(NumElts, false); for (int M : SVN->getMask()) if (M >= 0 && M < NumElts) N0UsedElements[M] = true; else if (M >= NumElts) N1UsedElements[M - NumElts] = true; SDValue S0 = simplifyShuffleOperandRecursively(N0UsedElements, N0, DAG); SDValue S1 = simplifyShuffleOperandRecursively(N1UsedElements, N1, DAG); if (S0 == N0 && S1 == N1) return SDValue(); return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask()); } static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0, SDValue N1, SelectionDAG &DAG) { auto isUndefElt = [](SDValue V, int Idx) { // TODO - handle more cases as required. if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Idx).isUndef(); if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Idx != 0) || V.getOperand(0).isUndef(); return false; }; EVT VT = SVN->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); bool Changed = false; SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if ((0 <= Idx && Idx < (int)NumElts && isUndefElt(N0, Idx)) || ((int)NumElts < Idx && isUndefElt(N1, Idx - NumElts))) { Changed = true; Idx = -1; } NewMask.push_back(Idx); } if (Changed) return DAG.getVectorShuffle(VT, SDLoc(SVN), N0, N1, NewMask); return SDValue(); } // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, // or turn a shuffle of a single concat into simpler shuffle then concat. static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ShuffleVectorSDNode *SVN = cast(N); SmallVector Ops; EVT ConcatVT = N0.getOperand(0).getValueType(); unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); unsigned NumConcats = NumElts / NumElemsPerConcat; // Special case: shuffle(concat(A,B)) can be more efficiently represented // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high // half vector elements. if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() && std::all_of(SVN->getMask().begin() + NumElemsPerConcat, SVN->getMask().end(), [](int i) { return i == -1; })) { N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1), makeArrayRef(SVN->getMask().begin(), NumElemsPerConcat)); N1 = DAG.getUNDEF(ConcatVT); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); } // Look at every vector that's inserted. We're looking for exact // subvector-sized copies from a concatenated vector for (unsigned I = 0; I != NumConcats; ++I) { // Make sure we're dealing with a copy. unsigned Begin = I * NumElemsPerConcat; bool AllUndef = true, NoUndef = true; for (unsigned J = Begin; J != Begin + NumElemsPerConcat; ++J) { if (SVN->getMaskElt(J) >= 0) AllUndef = false; else NoUndef = false; } if (NoUndef) { if (SVN->getMaskElt(Begin) % NumElemsPerConcat != 0) return SDValue(); for (unsigned J = 1; J != NumElemsPerConcat; ++J) if (SVN->getMaskElt(Begin + J - 1) + 1 != SVN->getMaskElt(Begin + J)) return SDValue(); unsigned FirstElt = SVN->getMaskElt(Begin) / NumElemsPerConcat; if (FirstElt < N0.getNumOperands()) Ops.push_back(N0.getOperand(FirstElt)); else Ops.push_back(N1.getOperand(FirstElt - N0.getNumOperands())); } else if (AllUndef) { Ops.push_back(DAG.getUNDEF(N0.getOperand(0).getValueType())); } else { // Mixed with general masks and undefs, can't do optimization. return SDValue(); } } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. // // SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always // a simplification in some sense, but it isn't appropriate in general: some // BUILD_VECTORs are substantially cheaper than others. The general case // of a BUILD_VECTOR requires inserting each element individually (or // performing the equivalent in a temporary stack variable). A BUILD_VECTOR of // all constants is a single constant pool load. A BUILD_VECTOR where each // element is identical is a splat. A BUILD_VECTOR where most of the operands // are undef lowers to a small number of element insertions. // // To deal with this, we currently use a bunch of mostly arbitrary heuristics. // We don't fold shuffles where one side is a non-zero constant, and we don't // fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate // non-constant operands. This seems to work out reasonably well in practice. static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, SelectionDAG &DAG, const TargetLowering &TLI) { EVT VT = SVN->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = SVN->getOperand(0); SDValue N1 = SVN->getOperand(1); if (!N0->hasOneUse() || !N1->hasOneUse()) return SDValue(); // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as // discussed above. if (!N1.isUndef()) { bool N0AnyConst = isAnyConstantBuildVector(N0.getNode()); bool N1AnyConst = isAnyConstantBuildVector(N1.getNode()); if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode())) return SDValue(); if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode())) return SDValue(); } // If both inputs are splats of the same value then we can safely merge this // to a single BUILD_VECTOR with undef elements based on the shuffle mask. bool IsSplat = false; auto *BV0 = dyn_cast(N0); auto *BV1 = dyn_cast(N1); if (BV0 && BV1) if (SDValue Splat0 = BV0->getSplatValue()) IsSplat = (Splat0 == BV1->getSplatValue()); SmallVector Ops; SmallSet DuplicateOps; for (int M : SVN->getMask()) { SDValue Op = DAG.getUNDEF(VT.getScalarType()); if (M >= 0) { int Idx = M < (int)NumElts ? M : M - NumElts; SDValue &S = (M < (int)NumElts ? N0 : N1); if (S.getOpcode() == ISD::BUILD_VECTOR) { Op = S.getOperand(Idx); } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) { assert(Idx == 0 && "Unexpected SCALAR_TO_VECTOR operand index."); Op = S.getOperand(0); } else { // Operand can't be combined - bail out. return SDValue(); } } // Don't duplicate a non-constant BUILD_VECTOR operand unless we're // generating a splat; semantically, this is fine, but it's likely to // generate low-quality code if the target can't reconstruct an appropriate // shuffle. if (!Op.isUndef() && !isa(Op) && !isa(Op)) if (!IsSplat && !DuplicateOps.insert(Op).second) return SDValue(); Ops.push_back(Op); } // BUILD_VECTOR requires all inputs to be of the same type, find the // maximum type and extend them all. EVT SVT = VT.getScalarType(); if (SVT.isInteger()) for (SDValue &Op : Ops) SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); if (SVT != VT.getScalarType()) for (SDValue &Op : Ops) Op = TLI.isZExtFree(Op.getValueType(), SVT) ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT) : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT); return DAG.getBuildVector(VT, SDLoc(SVN), Ops); } // Match shuffles that can be converted to any_vector_extend_in_reg. // This is often generated during legalization. // e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) // TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, SelectionDAG &DAG, const TargetLowering &TLI, bool LegalOperations, bool LegalTypes) { EVT VT = SVN->getValueType(0); bool IsBigEndian = DAG.getDataLayout().isBigEndian(); // TODO Add support for big-endian when we have a test case. if (!VT.isInteger() || IsBigEndian) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); ArrayRef Mask = SVN->getMask(); SDValue N0 = SVN->getOperand(0); // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32)) auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) { for (unsigned i = 0; i != NumElts; ++i) { if (Mask[i] < 0) continue; if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale)) continue; return false; } return true; }; // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for // power-of-2 extensions as they are the most likely. for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) { // Check for non power of 2 vector sizes if (NumElts % Scale != 0) continue; if (!isAnyExtend(Scale)) continue; EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale); if (!LegalTypes || TLI.isTypeLegal(OutVT)) if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) return DAG.getBitcast(VT, DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT)); } return SDValue(); } // Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of // each source element of a large type into the lowest elements of a smaller // destination type. This is often generated during legalization. // If the source node itself was a '*_extend_vector_inreg' node then we should // then be able to remove it. static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) { EVT VT = SVN->getValueType(0); bool IsBigEndian = DAG.getDataLayout().isBigEndian(); // TODO Add support for big-endian when we have a test case. if (!VT.isInteger() || IsBigEndian) return SDValue(); SDValue N0 = peekThroughBitcast(SVN->getOperand(0)); unsigned Opcode = N0.getOpcode(); if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && Opcode != ISD::SIGN_EXTEND_VECTOR_INREG && Opcode != ISD::ZERO_EXTEND_VECTOR_INREG) return SDValue(); SDValue N00 = N0.getOperand(0); ArrayRef Mask = SVN->getMask(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits(); if (ExtDstSizeInBits % ExtSrcSizeInBits != 0) return SDValue(); unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits; // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1> auto isTruncate = [&Mask, &NumElts](unsigned Scale) { for (unsigned i = 0; i != NumElts; ++i) { if (Mask[i] < 0) continue; if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale)) continue; return false; } return true; }; // At the moment we just handle the case where we've truncated back to the // same size as before the extension. // TODO: handle more extension/truncation cases as cases arise. if (EltSizeInBits != ExtSrcSizeInBits) return SDValue(); // We can remove *extend_vector_inreg only if the truncation happens at // the same scale as the extension. if (isTruncate(ExtScale)) return DAG.getBitcast(VT, N00); return SDValue(); } // Combine shuffles of splat-shuffles of the form: // shuffle (shuffle V, undef, splat-mask), undef, M // If splat-mask contains undef elements, we need to be careful about // introducing undef's in the folded mask which are not the result of composing // the masks of the shuffles. static SDValue combineShuffleOfSplat(ArrayRef UserMask, ShuffleVectorSDNode *Splat, SelectionDAG &DAG) { ArrayRef SplatMask = Splat->getMask(); assert(UserMask.size() == SplatMask.size() && "Mask length mismatch"); // Prefer simplifying to the splat-shuffle, if possible. This is legal if // every undef mask element in the splat-shuffle has a corresponding undef // element in the user-shuffle's mask or if the composition of mask elements // would result in undef. // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask): // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u] // In this case it is not legal to simplify to the splat-shuffle because we // may be exposing the users of the shuffle an undef element at index 1 // which was not there before the combine. // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u] // In this case the composition of masks yields SplatMask, so it's ok to // simplify to the splat-shuffle. // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u] // In this case the composed mask includes all undef elements of SplatMask // and in addition sets element zero to undef. It is safe to simplify to // the splat-shuffle. auto CanSimplifyToExistingSplat = [](ArrayRef UserMask, ArrayRef SplatMask) { for (unsigned i = 0, e = UserMask.size(); i != e; ++i) if (UserMask[i] != -1 && SplatMask[i] == -1 && SplatMask[UserMask[i]] != -1) return false; return true; }; if (CanSimplifyToExistingSplat(UserMask, SplatMask)) return SDValue(Splat, 0); // Create a new shuffle with a mask that is composed of the two shuffles' // masks. SmallVector NewMask; for (int Idx : UserMask) NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]); return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat), Splat->getOperand(0), Splat->getOperand(1), NewMask); } /// If the shuffle mask is taking exactly one element from the first vector /// operand and passing through all other elements from the second vector /// operand, return the index of the mask element that is choosing an element /// from the first operand. Otherwise, return -1. static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef Mask) { int MaskSize = Mask.size(); int EltFromOp0 = -1; // TODO: This does not match if there are undef elements in the shuffle mask. // Should we ignore undefs in the shuffle mask instead? The trade-off is // removing an instruction (a shuffle), but losing the knowledge that some // vector lanes are not needed. for (int i = 0; i != MaskSize; ++i) { if (Mask[i] >= 0 && Mask[i] < MaskSize) { // We're looking for a shuffle of exactly one element from operand 0. if (EltFromOp0 != -1) return -1; EltFromOp0 = i; } else if (Mask[i] != i + MaskSize) { // Nothing from operand 1 can change lanes. return -1; } } return EltFromOp0; } /// If a shuffle inserts exactly one element from a source vector operand into /// another vector operand and we can access the specified element as a scalar, /// then we can eliminate the shuffle. static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { // First, check if we are taking one element of a vector and shuffling that // element into another vector. ArrayRef Mask = Shuf->getMask(); SmallVector CommutedMask(Mask.begin(), Mask.end()); SDValue Op0 = Shuf->getOperand(0); SDValue Op1 = Shuf->getOperand(1); int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask); if (ShufOp0Index == -1) { // Commute mask and check again. ShuffleVectorSDNode::commuteMask(CommutedMask); ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask); if (ShufOp0Index == -1) return SDValue(); // Commute operands to match the commuted shuffle mask. std::swap(Op0, Op1); Mask = CommutedMask; } // The shuffle inserts exactly one element from operand 0 into operand 1. // Now see if we can access that element as a scalar via a real insert element // instruction. // TODO: We can try harder to locate the element as a scalar. Examples: it // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant. assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() && "Shuffle mask value must be from operand 0"); if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); auto *InsIndexC = dyn_cast(Op0.getOperand(2)); if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index]) return SDValue(); // There's an existing insertelement with constant insertion index, so we // don't need to check the legality/profitability of a replacement operation // that differs at most in the constant value. The target should be able to // lower any of those in a similar way. If not, legalization will expand this // to a scalar-to-vector plus shuffle. // // Note that the shuffle may move the scalar from the position that the insert // element used. Therefore, our new insert element occurs at the shuffle's // mask index value, not the insert's index value. // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C' SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf), Op0.getOperand(2).getValueType()); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(), Op1, Op0.getOperand(1), NewInsIndex); } SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG"); // Canonicalize shuffle undef, undef -> undef if (N0.isUndef() && N1.isUndef()) return DAG.getUNDEF(VT); ShuffleVectorSDNode *SVN = cast(N); // Canonicalize shuffle v, v -> v, undef if (N0 == N1) { SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx >= (int)NumElts) Idx -= NumElts; NewMask.push_back(Idx); } return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask); } // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask. if (N0.isUndef()) return DAG.getCommutedVectorShuffle(*SVN); // Remove references to rhs if it is undef if (N1.isUndef()) { bool Changed = false; SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx >= (int)NumElts) { Idx = -1; Changed = true; } NewMask.push_back(Idx); } if (Changed) return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); } // Simplify shuffle mask if a referenced element is UNDEF. if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG)) return V; if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG)) return InsElt; // A shuffle of a single vector that is a splat can always be folded. if (auto *N0Shuf = dyn_cast(N0)) if (N1->isUndef() && N0Shuf->isSplat()) return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG); // If it is a splat, check if the argument vector is another splat or a // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { SDNode *V = N0.getNode(); // If this is a bit convert that changes the element type of the vector but // not the number of vector elements, look through it. Be careful not to // look though conversions that change things like v4f32 to v2f64. if (V->getOpcode() == ISD::BITCAST) { SDValue ConvInput = V->getOperand(0); if (ConvInput.getValueType().isVector() && ConvInput.getValueType().getVectorNumElements() == NumElts) V = ConvInput.getNode(); } if (V->getOpcode() == ISD::BUILD_VECTOR) { assert(V->getNumOperands() == NumElts && "BUILD_VECTOR has wrong number of operands"); SDValue Base; bool AllSame = true; for (unsigned i = 0; i != NumElts; ++i) { if (!V->getOperand(i).isUndef()) { Base = V->getOperand(i); break; } } // Splat of , return if (!Base.getNode()) return N0; for (unsigned i = 0; i != NumElts; ++i) { if (V->getOperand(i) != Base) { AllSame = false; break; } } // Splat of , return if (AllSame) return N0; // Canonicalize any other splat as a build_vector. const SDValue &Splatted = V->getOperand(SVN->getSplatIndex()); SmallVector Ops(NumElts, Splatted); SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops); // We may have jumped through bitcasts, so the type of the // BUILD_VECTOR may not match the type of the shuffle. if (V->getValueType(0) != VT) NewBV = DAG.getBitcast(VT, NewBV); return NewBV; } } // There are various patterns used to build up a vector from smaller vectors, // subvectors, or elements. Scan chains of these and replace unused insertions // or components with undef. if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG)) return S; // Match shuffles that can be converted to any_vector_extend_in_reg. if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes)) return V; // Combine "truncate_vector_in_reg" style shuffles. if (SDValue V = combineTruncationShuffle(SVN, DAG)) return V; if (N0.getOpcode() == ISD::CONCAT_VECTORS && Level < AfterLegalizeVectorOps && (N1.isUndef() || (N1.getOpcode() == ISD::CONCAT_VECTORS && N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) { if (SDValue V = partitionShuffleOfConcats(N, DAG)) return V; } // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI)) return Res; // If this shuffle only has a single input that is a bitcasted shuffle, // attempt to merge the 2 shuffles and suitably bitcast the inputs/output // back to their original types. if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && N1.isUndef() && Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) { // Peek through the bitcast only if there is one user. SDValue BC0 = N0; while (BC0.getOpcode() == ISD::BITCAST) { if (!BC0.hasOneUse()) break; BC0 = BC0.getOperand(0); } auto ScaleShuffleMask = [](ArrayRef Mask, int Scale) { if (Scale == 1) return SmallVector(Mask.begin(), Mask.end()); SmallVector NewMask; for (int M : Mask) for (int s = 0; s != Scale; ++s) NewMask.push_back(M < 0 ? -1 : Scale * M + s); return NewMask; }; if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) { EVT SVT = VT.getScalarType(); EVT InnerVT = BC0->getValueType(0); EVT InnerSVT = InnerVT.getScalarType(); // Determine which shuffle works with the smaller scalar type. EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT; EVT ScaleSVT = ScaleVT.getScalarType(); if (TLI.isTypeLegal(ScaleVT) && 0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) && 0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) { int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits(); int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits(); // Scale the shuffle masks to the smaller scalar type. ShuffleVectorSDNode *InnerSVN = cast(BC0); SmallVector InnerMask = ScaleShuffleMask(InnerSVN->getMask(), InnerScale); SmallVector OuterMask = ScaleShuffleMask(SVN->getMask(), OuterScale); // Merge the shuffle masks. SmallVector NewMask; for (int M : OuterMask) NewMask.push_back(M < 0 ? -1 : InnerMask[M]); // Test for shuffle mask legality over both commutations. SDValue SV0 = BC0->getOperand(0); SDValue SV1 = BC0->getOperand(1); bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT); if (!LegalMask) { std::swap(SV0, SV1); ShuffleVectorSDNode::commuteMask(NewMask); LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT); } if (LegalMask) { SV0 = DAG.getBitcast(ScaleVT, SV0); SV1 = DAG.getBitcast(ScaleVT, SV1); return DAG.getBitcast( VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask)); } } } } // Canonicalize shuffles according to rules: // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(N1->getOperand(0).getValueType() == VT && "Shuffle types don't match"); SDValue SV0 = N1->getOperand(0); SDValue SV1 = N1->getOperand(1); bool HasSameOp0 = N0 == SV0; bool IsSV1Undef = SV1.isUndef(); if (HasSameOp0 || IsSV1Undef || N0 == SV1) // Commute the operands of this shuffle so that next rule // will trigger. return DAG.getCommutedVectorShuffle(*SVN); } // Try to fold according to rules: // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) // Don't try to fold shuffles with illegal type. // Only fold if this shuffle is the only user of the other shuffle. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) && Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { ShuffleVectorSDNode *OtherSV = cast(N0); // Don't try to fold splats; they're likely to simplify somehow, or they // might be free. if (OtherSV->isSplat()) return SDValue(); // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(OtherSV->getOperand(0).getValueType() == VT && "Shuffle types don't match"); SDValue SV0, SV1; SmallVector Mask; // Compute the combined shuffle mask for a shuffle with SV0 as the first // operand, and SV1 as the second operand. for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx < 0) { // Propagate Undef. Mask.push_back(Idx); continue; } SDValue CurrentVec; if (Idx < (int)NumElts) { // This shuffle index refers to the inner shuffle N0. Lookup the inner // shuffle mask to identify which vector is actually referenced. Idx = OtherSV->getMaskElt(Idx); if (Idx < 0) { // Propagate Undef. Mask.push_back(Idx); continue; } CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0) : OtherSV->getOperand(1); } else { // This shuffle index references an element within N1. CurrentVec = N1; } // Simple case where 'CurrentVec' is UNDEF. if (CurrentVec.isUndef()) { Mask.push_back(-1); continue; } // Canonicalize the shuffle index. We don't know yet if CurrentVec // will be the first or second operand of the combined shuffle. Idx = Idx % NumElts; if (!SV0.getNode() || SV0 == CurrentVec) { // Ok. CurrentVec is the left hand side. // Update the mask accordingly. SV0 = CurrentVec; Mask.push_back(Idx); continue; } // Bail out if we cannot convert the shuffle pair into a single shuffle. if (SV1.getNode() && SV1 != CurrentVec) return SDValue(); // Ok. CurrentVec is the right hand side. // Update the mask accordingly. SV1 = CurrentVec; Mask.push_back(Idx + NumElts); } // Check if all indices in Mask are Undef. In case, propagate Undef. bool isUndefMask = true; for (unsigned i = 0; i != NumElts && isUndefMask; ++i) isUndefMask &= Mask[i] < 0; if (isUndefMask) return DAG.getUNDEF(VT); if (!SV0.getNode()) SV0 = DAG.getUNDEF(VT); if (!SV1.getNode()) SV1 = DAG.getUNDEF(VT); // Avoid introducing shuffles with illegal mask. if (!TLI.isShuffleMaskLegal(Mask, VT)) { ShuffleVectorSDNode::commuteMask(Mask); if (!TLI.isShuffleMaskLegal(Mask, VT)) return SDValue(); // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2) std::swap(SV0, SV1); } // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask); } return SDValue(); } SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) { SDValue InVal = N->getOperand(0); EVT VT = N->getValueType(0); // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern // with a VECTOR_SHUFFLE and possible truncate. if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue InVec = InVal->getOperand(0); SDValue EltNo = InVal->getOperand(1); auto InVecT = InVec.getValueType(); if (ConstantSDNode *C0 = dyn_cast(EltNo)) { SmallVector NewMask(InVecT.getVectorNumElements(), -1); int Elt = C0->getZExtValue(); NewMask[0] = Elt; SDValue Val; // If we have an implict truncate do truncate here as long as it's legal. // if it's not legal, this should if (VT.getScalarType() != InVal.getValueType() && InVal.getValueType().isScalarInteger() && isTypeLegal(VT.getScalarType())) { Val = DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal); return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val); } if (VT.getScalarType() == InVecT.getScalarType() && VT.getVectorNumElements() <= InVecT.getVectorNumElements() && TLI.isShuffleMaskLegal(NewMask, VT)) { Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec, DAG.getUNDEF(InVecT), NewMask); // If the initial vector is the correct size this shuffle is a // valid result. if (VT == InVecT) return Val; // If not we must truncate the vector. if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) { MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy); EVT SubVT = EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(), VT.getVectorNumElements()); Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val, ZeroIdx); return Val; } } } } return SDValue(); } SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); // If inserting an UNDEF, just return the original vector. if (N1.isUndef()) return N0; // For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow // us to pull BITCASTs from input to output. if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR) if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode())) return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2); // If this is an insert of an extracted vector into an undef vector, we can // just use the input to the extract. if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) return N1.getOperand(0); // If we are inserting a bitcast value into an undef, with the same // number of elements, just use the bitcast input of the extract. // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 -> // BITCAST (INSERT_SUBVECTOR UNDEF N1 N2) if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST && N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(0).getOperand(1) == N2 && N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() == VT.getVectorNumElements()) { return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0)); } // If both N1 and N2 are bitcast values on which insert_subvector // would makes sense, pull the bitcast through. // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 -> // BITCAST (INSERT_SUBVECTOR N0 N1 N2) if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) { SDValue CN0 = N0.getOperand(0); SDValue CN1 = N1.getOperand(0); if (CN0.getValueType().getVectorElementType() == CN1.getValueType().getVectorElementType() && CN0.getValueType().getVectorNumElements() == VT.getVectorNumElements()) { SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), CN0.getValueType(), CN0, CN1, N2); return DAG.getBitcast(VT, NewINSERT); } } // Combine INSERT_SUBVECTORs where we are inserting to the same index. // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx ) // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(1).getValueType() == N1.getValueType() && N0.getOperand(2) == N2) return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), N1, N2); if (!isa(N2)) return SDValue(); unsigned InsIdx = cast(N2)->getZExtValue(); // Canonicalize insert_subvector dag nodes. // Example: // (insert_subvector (insert_subvector A, Idx0), Idx1) // -> (insert_subvector (insert_subvector A, Idx1), Idx0) if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && N1.getValueType() == N0.getOperand(1).getValueType() && isa(N0.getOperand(2))) { unsigned OtherIdx = N0.getConstantOperandVal(2); if (InsIdx < OtherIdx) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), N1, N2); AddToWorklist(NewOp.getNode()); return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()), VT, NewOp, N0.getOperand(1), N0.getOperand(2)); } } // If the input vector is a concatenation, and the insert replaces // one of the pieces, we can optimize into a single concat_vectors. if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && N0.getOperand(0).getValueType() == N1.getValueType()) { unsigned Factor = N1.getValueType().getVectorNumElements(); SmallVector Ops(N0->op_begin(), N0->op_end()); Ops[cast(N2)->getZExtValue() / Factor] = N1; return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } return SDValue(); } SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) { SDValue N0 = N->getOperand(0); // fold (fp_to_fp16 (fp16_to_fp op)) -> op if (N0->getOpcode() == ISD::FP16_TO_FP) return N0->getOperand(0); return SDValue(); } SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op) if (N0->getOpcode() == ISD::AND) { ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1)); if (AndConst && AndConst->getAPIntValue() == 0xffff) { return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), N0.getOperand(0)); } } return SDValue(); } /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> /// vector_shuffle V, Zero, <0, 4, 2, 4> SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { + assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); + EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = peekThroughBitcast(N->getOperand(1)); SDLoc DL(N); // Make sure we're not running after operation legalization where it // may have custom lowered the vector shuffles. if (LegalOperations) return SDValue(); - if (N->getOpcode() != ISD::AND) - return SDValue(); - if (RHS.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); EVT RVT = RHS.getValueType(); unsigned NumElts = RHS.getNumOperands(); // Attempt to create a valid clear mask, splitting the mask into // sub elements and checking to see if each is // all zeros or all ones - suitable for shuffle masking. auto BuildClearMask = [&](int Split) { int NumSubElts = NumElts * Split; int NumSubBits = RVT.getScalarSizeInBits() / Split; SmallVector Indices; for (int i = 0; i != NumSubElts; ++i) { int EltIdx = i / Split; int SubIdx = i % Split; SDValue Elt = RHS.getOperand(EltIdx); if (Elt.isUndef()) { Indices.push_back(-1); continue; } APInt Bits; if (isa(Elt)) Bits = cast(Elt)->getAPIntValue(); else if (isa(Elt)) Bits = cast(Elt)->getValueAPF().bitcastToAPInt(); else return SDValue(); // Extract the sub element from the constant bit mask. if (DAG.getDataLayout().isBigEndian()) { Bits.lshrInPlace((Split - SubIdx - 1) * NumSubBits); } else { Bits.lshrInPlace(SubIdx * NumSubBits); } if (Split > 1) Bits = Bits.trunc(NumSubBits); if (Bits.isAllOnesValue()) Indices.push_back(i); else if (Bits == 0) Indices.push_back(i + NumSubElts); else return SDValue(); } // Let's see if the target supports this vector_shuffle. EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits); EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts); if (!TLI.isVectorClearMaskLegal(Indices, ClearVT)) return SDValue(); SDValue Zero = DAG.getConstant(0, DL, ClearVT); return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL, DAG.getBitcast(ClearVT, LHS), Zero, Indices)); }; // Determine maximum split level (byte level masking). int MaxSplit = 1; if (RVT.getScalarSizeInBits() % 8 == 0) MaxSplit = RVT.getScalarSizeInBits() / 8; for (int Split = 1; Split <= MaxSplit; ++Split) if (RVT.getScalarSizeInBits() % Split == 0) if (SDValue S = BuildClearMask(Split)) return S; return SDValue(); } /// Visit a binary vector operation, like ADD. SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { assert(N->getValueType(0).isVector() && "SimplifyVBinOp only works on vectors!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Ops[] = {LHS, RHS}; // See if we can constant fold the vector operation. if (SDValue Fold = DAG.FoldConstantVectorArithmetic( N->getOpcode(), SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags())) return Fold; - - // Try to convert a constant mask AND into a shuffle clear mask. - if (SDValue Shuffle = XformToShuffleWithZero(N)) - return Shuffle; // Type legalization might introduce new shuffles in the DAG. // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask))) // -> (shuffle (VBinOp (A, B)), Undef, Mask). if (LegalTypes && isa(LHS) && isa(RHS) && LHS.hasOneUse() && RHS.hasOneUse() && LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef()) { ShuffleVectorSDNode *SVN0 = cast(LHS); ShuffleVectorSDNode *SVN1 = cast(RHS); if (SVN0->getMask().equals(SVN1->getMask())) { EVT VT = N->getValueType(0); SDValue UndefVector = LHS.getOperand(1); SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS.getOperand(0), RHS.getOperand(0), N->getFlags()); AddUsersToWorklist(N); return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector, SVN0->getMask()); } } return SDValue(); } SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2) { assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!"); SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2, cast(N0.getOperand(2))->get()); // If we got a simplified select_cc node back from SimplifySelectCC, then // break it down into a new SETCC node, and a new SELECT node, and then return // the SELECT node, since we were called with a SELECT node. if (SCC.getNode()) { // Check to see if we got a select_cc back (to turn into setcc/select). // Otherwise, just return whatever node we got back, like fabs. if (SCC.getOpcode() == ISD::SELECT_CC) { SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(), SCC.getOperand(0), SCC.getOperand(1), SCC.getOperand(4)); AddToWorklist(SETCC.getNode()); return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, SCC.getOperand(2), SCC.getOperand(3)); } return SCC; } return SDValue(); } /// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values /// being selected between, see if we can simplify the select. Callers of this /// should assume that TheSelect is deleted if this returns true. As such, they /// should return the appropriate thing (e.g. the node) back to the top-level of /// the DAG combiner loop to avoid it being looked at. bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, SDValue RHS) { // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x)) // The select + setcc is redundant, because fsqrt returns NaN for X < 0. if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) { if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) { // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?)) SDValue Sqrt = RHS; ISD::CondCode CC; SDValue CmpLHS; const ConstantFPSDNode *Zero = nullptr; if (TheSelect->getOpcode() == ISD::SELECT_CC) { CC = dyn_cast(TheSelect->getOperand(4))->get(); CmpLHS = TheSelect->getOperand(0); Zero = isConstOrConstSplatFP(TheSelect->getOperand(1)); } else { // SELECT or VSELECT SDValue Cmp = TheSelect->getOperand(0); if (Cmp.getOpcode() == ISD::SETCC) { CC = dyn_cast(Cmp.getOperand(2))->get(); CmpLHS = Cmp.getOperand(0); Zero = isConstOrConstSplatFP(Cmp.getOperand(1)); } } if (Zero && Zero->isZero() && Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT || CC == ISD::SETULT || CC == ISD::SETLT)) { // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x)) CombineTo(TheSelect, Sqrt); return true; } } } // Cannot simplify select with vector condition if (TheSelect->getOperand(0).getValueType().isVector()) return false; // If this is a select from two identical things, try to pull the operation // through the select. if (LHS.getOpcode() != RHS.getOpcode() || !LHS.hasOneUse() || !RHS.hasOneUse()) return false; // If this is a load and the token chain is identical, replace the select // of two loads with a load through a select of the address to load from. // This triggers in things like "select bool X, 10.0, 123.0" after the FP // constants have been dropped into the constant pool. if (LHS.getOpcode() == ISD::LOAD) { LoadSDNode *LLD = cast(LHS); LoadSDNode *RLD = cast(RHS); // Token chains must be identical. if (LHS.getOperand(0) != RHS.getOperand(0) || // Do not let this transformation reduce the number of volatile loads. LLD->isVolatile() || RLD->isVolatile() || // FIXME: If either is a pre/post inc/dec load, // we'd need to split out the address adjustment. LLD->isIndexed() || RLD->isIndexed() || // If this is an EXTLOAD, the VT's must match. LLD->getMemoryVT() != RLD->getMemoryVT() || // If this is an EXTLOAD, the kind of extension must match. (LLD->getExtensionType() != RLD->getExtensionType() && // The only exception is if one of the extensions is anyext. LLD->getExtensionType() != ISD::EXTLOAD && RLD->getExtensionType() != ISD::EXTLOAD) || // FIXME: this discards src value information. This is // over-conservative. It would be beneficial to be able to remember // both potential memory locations. Since we are discarding // src value info, don't do the transformation if the memory // locations are not in the default address space. LLD->getPointerInfo().getAddrSpace() != 0 || RLD->getPointerInfo().getAddrSpace() != 0 || !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(), LLD->getBasePtr().getValueType())) return false; // Check that the select condition doesn't reach either load. If so, // folding this will induce a cycle into the DAG. If not, this is safe to // xform, so create a select of the addresses. SDValue Addr; if (TheSelect->getOpcode() == ISD::SELECT) { SDNode *CondNode = TheSelect->getOperand(0).getNode(); if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) || (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode))) return false; // The loads must not depend on one another. if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD)) return false; Addr = DAG.getSelect(SDLoc(TheSelect), LLD->getBasePtr().getValueType(), TheSelect->getOperand(0), LLD->getBasePtr(), RLD->getBasePtr()); } else { // Otherwise SELECT_CC SDNode *CondLHS = TheSelect->getOperand(0).getNode(); SDNode *CondRHS = TheSelect->getOperand(1).getNode(); if ((LLD->hasAnyUseOfValue(1) && (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS))) || (RLD->hasAnyUseOfValue(1) && (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS)))) return false; Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect), LLD->getBasePtr().getValueType(), TheSelect->getOperand(0), TheSelect->getOperand(1), LLD->getBasePtr(), RLD->getBasePtr(), TheSelect->getOperand(4)); } SDValue Load; // It is safe to replace the two loads if they have different alignments, // but the new load must be the minimum (most restrictive) alignment of the // inputs. unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment()); MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags(); if (!RLD->isInvariant()) MMOFlags &= ~MachineMemOperand::MOInvariant; if (!RLD->isDereferenceable()) MMOFlags &= ~MachineMemOperand::MODereferenceable; if (LLD->getExtensionType() == ISD::NON_EXTLOAD) { // FIXME: Discards pointer and AA info. Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect), LLD->getChain(), Addr, MachinePointerInfo(), Alignment, MMOFlags); } else { // FIXME: Discards pointer and AA info. Load = DAG.getExtLoad( LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType() : LLD->getExtensionType(), SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr, MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags); } // Users of the select now use the result of the load. CombineTo(TheSelect, Load); // Users of the old loads now use the new load's chain. We know the // old-load value is dead now. CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1)); CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1)); return true; } return false; } /// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and /// bitwise 'and'. SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC) { // If this is a select where the false operand is zero and the compare is a // check of the sign bit, see if we can perform the "gzip trick": // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A EVT XType = N0.getValueType(); EVT AType = N2.getValueType(); if (!isNullConstant(N3) || !XType.bitsGE(AType)) return SDValue(); // If the comparison is testing for a positive value, we have to invert // the sign bit mask, so only do that transform if the target has a bitwise // 'and not' instruction (the invert is free). if (CC == ISD::SETGT && TLI.hasAndNot(N2)) { // (X > -1) ? A : 0 // (X > 0) ? X : 0 <-- This is canonical signed max. if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2))) return SDValue(); } else if (CC == ISD::SETLT) { // (X < 0) ? A : 0 // (X < 1) ? X : 0 <-- This is un-canonicalized signed min. if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2))) return SDValue(); } else { return SDValue(); } // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit // constant. EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType()); auto *N2C = dyn_cast(N2.getNode()); if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) { unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1; SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt); AddToWorklist(Shift.getNode()); if (XType.bitsGT(AType)) { Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); AddToWorklist(Shift.getNode()); } if (CC == ISD::SETGT) Shift = DAG.getNOT(DL, Shift, AType); return DAG.getNode(ISD::AND, DL, AType, Shift, N2); } SDValue ShiftAmt = DAG.getConstant(XType.getSizeInBits() - 1, DL, ShiftAmtTy); SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt); AddToWorklist(Shift.getNode()); if (XType.bitsGT(AType)) { Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); AddToWorklist(Shift.getNode()); } if (CC == ISD::SETGT) Shift = DAG.getNOT(DL, Shift, AType); return DAG.getNode(ISD::AND, DL, AType, Shift, N2); } /// Simplify an expression of the form (N0 cond N1) ? N2 : N3 /// where 'cond' is the comparison specified by CC. SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, bool NotExtCompare) { // (x ? y : y) -> y. if (N2 == N3) return N2; EVT VT = N2.getValueType(); ConstantSDNode *N1C = dyn_cast(N1.getNode()); ConstantSDNode *N2C = dyn_cast(N2.getNode()); // Determine if the condition we're dealing with is constant SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, CC, DL, false); if (SCC.getNode()) AddToWorklist(SCC.getNode()); if (ConstantSDNode *SCCC = dyn_cast_or_null(SCC.getNode())) { // fold select_cc true, x, y -> x // fold select_cc false, x, y -> y return !SCCC->isNullValue() ? N2 : N3; } // Check to see if we can simplify the select into an fabs node if (ConstantFPSDNode *CFP = dyn_cast(N1)) { // Allow either -0.0 or 0.0 if (CFP->isZero()) { // select (setg[te] X, +/-0.0), X, fneg(X) -> fabs if ((CC == ISD::SETGE || CC == ISD::SETGT) && N0 == N2 && N3.getOpcode() == ISD::FNEG && N2 == N3.getOperand(0)) return DAG.getNode(ISD::FABS, DL, VT, N0); // select (setl[te] X, +/-0.0), fneg(X), X -> fabs if ((CC == ISD::SETLT || CC == ISD::SETLE) && N0 == N3 && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N3) return DAG.getNode(ISD::FABS, DL, VT, N3); } } // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)" // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0 // in it. This is a win when the constant is not otherwise available because // it replaces two constant pool loads with one. We only do this if the FP // type is known to be legal, because if it isn't, then we are before legalize // types an we want the other legalization to happen first (e.g. to avoid // messing with soft float) and if the ConstantFP is not legal, because if // it is legal, we may not need to store the FP constant in a constant pool. if (ConstantFPSDNode *TV = dyn_cast(N2)) if (ConstantFPSDNode *FV = dyn_cast(N3)) { if (TLI.isTypeLegal(N2.getValueType()) && (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) != TargetLowering::Legal && !TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) && !TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) && // If both constants have multiple uses, then we won't need to do an // extra load, they are likely around in registers for other users. (TV->hasOneUse() || FV->hasOneUse())) { Constant *Elts[] = { const_cast(FV->getConstantFPValue()), const_cast(TV->getConstantFPValue()) }; Type *FPTy = Elts[0]->getType(); const DataLayout &TD = DAG.getDataLayout(); // Create a ConstantArray of the two constants. Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts); SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()), TD.getPrefTypeAlignment(FPTy)); unsigned Alignment = cast(CPIdx)->getAlignment(); // Get the offsets to the 0 and 1 element of the array so that we can // select between them. SDValue Zero = DAG.getIntPtrConstant(0, DL); unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType()); SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV)); SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); AddToWorklist(Cond.getNode()); SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero); AddToWorklist(CstOffset.getNode()); CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset); AddToWorklist(CPIdx.getNode()); return DAG.getLoad( TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); } } if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC)) return V; // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A) // where y is has a single bit set. // A plaintext description would be, we can turn the SELECT_CC into an AND // when the condition can be materialized as an all-ones register. Any // single bit-test can be materialized as an all-ones register with // shift-left and shift-right-arith. if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND && N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) { SDValue AndLHS = N0->getOperand(0); ConstantSDNode *ConstAndRHS = dyn_cast(N0->getOperand(1)); if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) { // Shift the tested bit over the sign bit. const APInt &AndMask = ConstAndRHS->getAPIntValue(); SDValue ShlAmt = DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS), getShiftAmountTy(AndLHS.getValueType())); SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt); // Now arithmetic right shift it all the way over, so the result is either // all-ones, or zero. SDValue ShrAmt = DAG.getConstant(AndMask.getBitWidth() - 1, SDLoc(Shl), getShiftAmountTy(Shl.getValueType())); SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt); return DAG.getNode(ISD::AND, DL, VT, Shr, N3); } } // fold select C, 16, 0 -> shl C, 4 if (N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2() && TLI.getBooleanContents(N0.getValueType()) == TargetLowering::ZeroOrOneBooleanContent) { // If the caller doesn't want us to simplify this into a zext of a compare, // don't do it. if (NotExtCompare && N2C->isOne()) return SDValue(); // Get a SetCC of the condition // NOTE: Don't create a SETCC if it's not legal on this target. if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) { SDValue Temp, SCC; // cast from setcc result type to select result type if (LegalTypes) { SCC = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); if (N2.getValueType().bitsLT(SCC.getValueType())) Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), N2.getValueType()); else Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), N2.getValueType(), SCC); } else { SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC); Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), N2.getValueType(), SCC); } AddToWorklist(SCC.getNode()); AddToWorklist(Temp.getNode()); if (N2C->isOne()) return Temp; // shl setcc result by log2 n2c return DAG.getNode( ISD::SHL, DL, N2.getValueType(), Temp, DAG.getConstant(N2C->getAPIntValue().logBase2(), SDLoc(Temp), getShiftAmountTy(Temp.getValueType()))); } } // Check to see if this is an integer abs. // select_cc setg[te] X, 0, X, -X -> // select_cc setgt X, -1, X, -X -> // select_cc setl[te] X, 0, -X, X -> // select_cc setlt X, 1, -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) if (N1C) { ConstantSDNode *SubC = nullptr; if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) || (N1C->isAllOnesValue() && CC == ISD::SETGT)) && N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) SubC = dyn_cast(N3.getOperand(0)); else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) || (N1C->isOne() && CC == ISD::SETLT)) && N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1)) SubC = dyn_cast(N2.getOperand(0)); EVT XType = N0.getValueType(); if (SubC && SubC->isNullValue() && XType.isInteger()) { SDLoc DL(N0); SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, DAG.getConstant(XType.getSizeInBits() - 1, DL, getShiftAmountTy(N0.getValueType()))); SDValue Add = DAG.getNode(ISD::ADD, DL, XType, N0, Shift); AddToWorklist(Shift.getNode()); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); } } // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X) // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X) // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X) // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X) // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X) // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X) // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X) // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X) if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue ValueOnZero = N2; SDValue Count = N3; // If the condition is NE instead of E, swap the operands. if (CC == ISD::SETNE) std::swap(ValueOnZero, Count); // Check if the value on zero is a constant equal to the bits in the type. if (auto *ValueOnZeroC = dyn_cast(ValueOnZero)) { if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) { // If the other operand is cttz/cttz_zero_undef of N0, and cttz is // legal, combine to just cttz. if ((Count.getOpcode() == ISD::CTTZ || Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) && N0 == Count.getOperand(0) && (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT))) return DAG.getNode(ISD::CTTZ, DL, VT, N0); // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is // legal, combine to just ctlz. if ((Count.getOpcode() == ISD::CTLZ || Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) && N0 == Count.getOperand(0) && (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT))) return DAG.getNode(ISD::CTLZ, DL, VT, N0); } } } return SDValue(); } /// This is a stub for TargetLowering::SimplifySetCC. SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans) { TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); } /// Given an ISD::SDIV node expressing a divide by constant, return /// a DAG expression to select that will generate the same value by multiplying /// by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue DAGCombiner::BuildSDIV(SDNode *N) { // when optimising for minimum size, we don't want to expand a div to a mul // and a shift. if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (C->isNullValue()) return SDValue(); std::vector Built; SDValue S = TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); for (SDNode *N : Built) AddToWorklist(N); return S; } /// Given an ISD::SDIV node expressing a divide by constant power of 2, return a /// DAG expression that will generate the same value by right shifting. SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) { ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (C->isNullValue()) return SDValue(); std::vector Built; SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, &Built); for (SDNode *N : Built) AddToWorklist(N); return S; } /// Given an ISD::UDIV node expressing a divide by constant, return a DAG /// expression that will generate the same value by multiplying by a magic /// number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue DAGCombiner::BuildUDIV(SDNode *N) { // when optimising for minimum size, we don't want to expand a div to a mul // and a shift. if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (C->isNullValue()) return SDValue(); std::vector Built; SDValue S = TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); for (SDNode *N : Built) AddToWorklist(N); return S; } /// Determines the LogBase2 value for a non-null input value using the /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { EVT VT = V.getValueType(); unsigned EltBits = VT.getScalarSizeInBits(); SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V); SDValue Base = DAG.getConstant(EltBits - 1, DL, VT); SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz); return LogBase2; } /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) /// For the reciprocal, we need to find the zero of the function: /// F(X) = A X - 1 [which has a zero at X = 1/A] /// => /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form /// does not require additional intermediate precision] SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); // TODO: Handle half and/or extended types? EVT VT = Op.getValueType(); if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64) return SDValue(); // If estimates are explicitly disabled for this function, we're done. MachineFunction &MF = DAG.getMachineFunction(); int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF); if (Enabled == TLI.ReciprocalEstimate::Disabled) return SDValue(); // Estimates may be explicitly enabled for this type with a custom number of // refinement steps. int Iterations = TLI.getDivRefinementSteps(VT, MF); if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) { AddToWorklist(Est.getNode()); if (Iterations) { EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); // Newton iterations: Est = Est + Est (1 - Arg * Est) for (int i = 0; i < Iterations; ++i) { SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(NewEst.getNode()); Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags); AddToWorklist(Est.getNode()); } } return Est; } return SDValue(); } /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) /// For the reciprocal sqrt, we need to find the zero of the function: /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] /// => /// X_{i+1} = X_i (1.5 - A X_i^2 / 2) /// As a result, we precompute A/2 prior to the iteration loop. SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations, SDNodeFlags Flags, bool Reciprocal) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT); // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that // this entire sequence requires only one FP constant. SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags); AddToWorklist(HalfArg.getNode()); HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags); AddToWorklist(HalfArg.getNode()); // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags); AddToWorklist(NewEst.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(Est.getNode()); } // If non-reciprocal square root is requested, multiply the result by Arg. if (!Reciprocal) { Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags); AddToWorklist(Est.getNode()); } return Est; } /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) /// For the reciprocal sqrt, we need to find the zero of the function: /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] /// => /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations, SDNodeFlags Flags, bool Reciprocal) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT); SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT); // This routine must enter the loop below to work correctly // when (Reciprocal == false). assert(Iterations > 0); // Newton iterations for reciprocal square root: // E = (E * -0.5) * ((A * E) * E + -3.0) for (unsigned i = 0; i < Iterations; ++i) { SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags); AddToWorklist(AE.getNode()); SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags); AddToWorklist(AEE.getNode()); SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags); AddToWorklist(RHS.getNode()); // When calculating a square root at the last iteration build: // S = ((A * E) * -0.5) * ((A * E) * E + -3.0) // (notice a common subexpression) SDValue LHS; if (Reciprocal || (i + 1) < Iterations) { // RSQRT: LHS = (E * -0.5) LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags); } else { // SQRT: LHS = (A * E) * -0.5 LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags); } AddToWorklist(LHS.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags); AddToWorklist(Est.getNode()); } return Est; } /// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case /// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if /// Op can be zero. SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Reciprocal) { if (Level >= AfterLegalizeDAG) return SDValue(); // TODO: Handle half and/or extended types? EVT VT = Op.getValueType(); if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64) return SDValue(); // If estimates are explicitly disabled for this function, we're done. MachineFunction &MF = DAG.getMachineFunction(); int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF); if (Enabled == TLI.ReciprocalEstimate::Disabled) return SDValue(); // Estimates may be explicitly enabled for this type with a custom number of // refinement steps. int Iterations = TLI.getSqrtRefinementSteps(VT, MF); bool UseOneConstNR = false; if (SDValue Est = TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR, Reciprocal)) { AddToWorklist(Est.getNode()); if (Iterations) { Est = UseOneConstNR ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal) : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); if (!Reciprocal) { // Unfortunately, Est is now NaN if the input was exactly 0.0. // Select out this case and force the answer to 0.0. EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); EVT CCVT = getSetCCResultType(VT); SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); AddToWorklist(ZeroCmp.getNode()); Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, ZeroCmp, FPZero, Est); AddToWorklist(Est.getNode()); } } return Est; } return SDValue(); } SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, true); } SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, false); } /// Return true if there is any possibility that the two addresses overlap. bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { // If they are the same then they must be aliases. if (Op0->getBasePtr() == Op1->getBasePtr()) return true; // If they are both volatile then they cannot be reordered. if (Op0->isVolatile() && Op1->isVolatile()) return true; // If one operation reads from invariant memory, and the other may store, they // cannot alias. These should really be checking the equivalent of mayWrite, // but it only matters for memory nodes other than load /store. if (Op0->isInvariant() && Op1->writeMem()) return false; if (Op1->isInvariant() && Op0->writeMem()) return false; unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize(); unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize(); // Check for BaseIndexOffset matching. BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG); BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG); int64_t PtrDiff; if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be // able to calculate their relative offset if at least one arises // from an alloca. However, these allocas cannot overlap and we // can infer there is no alias. if (auto *A = dyn_cast(BasePtr0.getBase())) if (auto *B = dyn_cast(BasePtr1.getBase())) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // If the base are the same frame index but the we couldn't find a // constant offset, (indices are different) be conservative. if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) || !MFI.isFixedObjectIndex(B->getIndex()))) return false; } bool IsFI0 = isa(BasePtr0.getBase()); bool IsFI1 = isa(BasePtr1.getBase()); bool IsGV0 = isa(BasePtr0.getBase()); bool IsGV1 = isa(BasePtr1.getBase()); bool IsCV0 = isa(BasePtr0.getBase()); bool IsCV1 = isa(BasePtr1.getBase()); // If of mismatched base types or checkable indices we can check // they do not alias. if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) || (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) && (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1)) return false; // If we know required SrcValue1 and SrcValue2 have relatively large alignment // compared to the size and offset of the access, we may be able to prove they // do not alias. This check is conservative for now to catch cases created by // splitting vector types. int64_t SrcValOffset0 = Op0->getSrcValueOffset(); int64_t SrcValOffset1 = Op1->getSrcValueOffset(); unsigned OrigAlignment0 = Op0->getOriginalAlignment(); unsigned OrigAlignment1 = Op1->getOriginalAlignment(); if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) { int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0; int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1; // There is no overlap between these relatively aligned accesses of similar // size. Return no alias. if ((OffAlign0 + NumBytes0) <= OffAlign1 || (OffAlign1 + NumBytes1) <= OffAlign0) return false; } bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 ? CombinerGlobalAA : DAG.getSubtarget().useAA(); #ifndef NDEBUG if (CombinerAAOnlyFunc.getNumOccurrences() && CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) UseAA = false; #endif if (UseAA && AA && Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) { // Use alias analysis information. int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset; int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset; AliasResult AAResult = AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0, UseTBAA ? Op0->getAAInfo() : AAMDNodes()), MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1, UseTBAA ? Op1->getAAInfo() : AAMDNodes()) ); if (AAResult == NoAlias) return false; } // Otherwise we have to assume they alias. return true; } /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallVectorImpl &Aliases) { SmallVector Chains; // List of chains to visit. SmallPtrSet Visited; // Visited node set. // Get alias information for node. bool IsLoad = isa(N) && !cast(N)->isVolatile(); // Starting off. Chains.push_back(OriginalChain); unsigned Depth = 0; // Look at each chain and determine if it is an alias. If so, add it to the // aliases list. If not, then continue up the chain looking for the next // candidate. while (!Chains.empty()) { SDValue Chain = Chains.pop_back_val(); // For TokenFactor nodes, look at each operand and only continue up the // chain until we reach the depth limit. // // FIXME: The depth check could be made to return the last non-aliasing // chain we found before we hit a tokenfactor rather than the original // chain. if (Depth > TLI.getGatherAllAliasesMaxDepth()) { Aliases.clear(); Aliases.push_back(OriginalChain); return; } // Don't bother if we've been before. if (!Visited.insert(Chain.getNode()).second) continue; switch (Chain.getOpcode()) { case ISD::EntryToken: // Entry token is ideal chain operand, but handled in FindBetterChain. break; case ISD::LOAD: case ISD::STORE: { // Get alias information for Chain. bool IsOpLoad = isa(Chain.getNode()) && !cast(Chain.getNode())->isVolatile(); // If chain is alias then stop here. if (!(IsLoad && IsOpLoad) && isAlias(cast(N), cast(Chain.getNode()))) { Aliases.push_back(Chain); } else { // Look further up the chain. Chains.push_back(Chain.getOperand(0)); ++Depth; } break; } case ISD::TokenFactor: // We have to check each of the operands of the token factor for "small" // token factors, so we queue them up. Adding the operands to the queue // (stack) in reverse order maintains the original order and increases the // likelihood that getNode will find a matching token factor (CSE.) if (Chain.getNumOperands() > 16) { Aliases.push_back(Chain); break; } for (unsigned n = Chain.getNumOperands(); n;) Chains.push_back(Chain.getOperand(--n)); ++Depth; break; case ISD::CopyFromReg: // Forward past CopyFromReg. Chains.push_back(Chain.getOperand(0)); ++Depth; break; default: // For all other instructions we will just have to take what we can get. Aliases.push_back(Chain); break; } } } /// Walk up chain skipping non-aliasing memory nodes, looking for a better chain /// (aliasing node.) SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { if (OptLevel == CodeGenOpt::None) return OldChain; // Ops for replacing token factor. SmallVector Aliases; // Accumulate all the aliases to this node. GatherAllAliases(N, OldChain, Aliases); // If no operands then chain to entry token. if (Aliases.size() == 0) return DAG.getEntryNode(); // If a single operand then chain to it. We don't need to revisit it. if (Aliases.size() == 1) return Aliases[0]; // Construct a custom tailored token factor. return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } // This function tries to collect a bunch of potentially interesting // nodes to improve the chains of, all at once. This might seem // redundant, as this function gets called when visiting every store // node, so why not let the work be done on each store as it's visited? // // I believe this is mainly important because MergeConsecutiveStores // is unable to deal with merging stores of different sizes, so unless // we improve the chains of all the potential candidates up-front // before running MergeConsecutiveStores, it might only see some of // the nodes that will eventually be candidates, and then not be able // to go from a partially-merged state to the desired final // fully-merged state. bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { if (OptLevel == CodeGenOpt::None) return false; // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); // We must have a base and an offset. if (!BasePtr.getBase().getNode()) return false; // Do not handle stores to undef base pointers. if (BasePtr.getBase().isUndef()) return false; SmallVector ChainedStores; ChainedStores.push_back(St); // Walk up the chain and look for nodes with offsets from the same // base pointer. Stop when reaching an instruction with a different kind // or instruction which has a different base pointer. StoreSDNode *Index = St; while (Index) { // If the chain has more than one use, then we can't reorder the mem ops. if (Index != St && !SDValue(Index, 0)->hasOneUse()) break; if (Index->isVolatile() || Index->isIndexed()) break; // Find the base pointer and offset for this memory node. BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG); // Check that the base pointer is the same as the original one. if (!BasePtr.equalBaseIndex(Ptr, DAG)) break; // Walk up the chain to find the next store node, ignoring any // intermediate loads. Any other kind of node will halt the loop. SDNode *NextInChain = Index->getChain().getNode(); while (true) { if (StoreSDNode *STn = dyn_cast(NextInChain)) { // We found a store node. Use it for the next iteration. if (STn->isVolatile() || STn->isIndexed()) { Index = nullptr; break; } ChainedStores.push_back(STn); Index = STn; break; } else if (LoadSDNode *Ldn = dyn_cast(NextInChain)) { NextInChain = Ldn->getChain().getNode(); continue; } else { Index = nullptr; break; } } // end while } // At this point, ChainedStores lists all of the Store nodes // reachable by iterating up through chain nodes matching the above // conditions. For each such store identified, try to find an // earlier chain to attach the store to which won't violate the // required ordering. bool MadeChangeToSt = false; SmallVector, 8> BetterChains; for (StoreSDNode *ChainedStore : ChainedStores) { SDValue Chain = ChainedStore->getChain(); SDValue BetterChain = FindBetterChain(ChainedStore, Chain); if (Chain != BetterChain) { if (ChainedStore == St) MadeChangeToSt = true; BetterChains.push_back(std::make_pair(ChainedStore, BetterChain)); } } // Do all replacements after finding the replacements to make to avoid making // the chains more complicated by introducing new TokenFactors. for (auto Replacement : BetterChains) replaceStoreChain(Replacement.first, Replacement.second); return MadeChangeToSt; } /// This is the entry point for the file. void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA, CodeGenOpt::Level OptLevel) { /// This is the main entry point to this class. DAGCombiner(*this, AA, OptLevel).Run(Level); } Index: vendor/llvm/dist/lib/MC/MCParser/DarwinAsmParser.cpp =================================================================== --- vendor/llvm/dist/lib/MC/MCParser/DarwinAsmParser.cpp (revision 327151) +++ vendor/llvm/dist/lib/MC/MCParser/DarwinAsmParser.cpp (revision 327152) @@ -1,1145 +1,1145 @@ //===- DarwinAsmParser.cpp - Darwin (Mach-O) Assembly Parser --------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include #include #include using namespace llvm; namespace { /// \brief Implementation of directive handling which is shared across all /// Darwin targets. class DarwinAsmParser : public MCAsmParserExtension { template void addDirectiveHandler(StringRef Directive) { MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair( this, HandleDirective); getParser().addDirectiveHandler(Directive, Handler); } bool parseSectionSwitch(StringRef Segment, StringRef Section, unsigned TAA = 0, unsigned ImplicitAlign = 0, unsigned StubSize = 0); SMLoc LastVersionDirective; public: DarwinAsmParser() = default; void Initialize(MCAsmParser &Parser) override { // Call the base implementation. this->MCAsmParserExtension::Initialize(Parser); addDirectiveHandler<&DarwinAsmParser::parseDirectiveAltEntry>(".alt_entry"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveDesc>(".desc"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveIndirectSymbol>( ".indirect_symbol"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveLsym>(".lsym"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveSubsectionsViaSymbols>( ".subsections_via_symbols"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveDumpOrLoad>(".dump"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveDumpOrLoad>(".load"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveSection>(".section"); addDirectiveHandler<&DarwinAsmParser::parseDirectivePushSection>( ".pushsection"); addDirectiveHandler<&DarwinAsmParser::parseDirectivePopSection>( ".popsection"); addDirectiveHandler<&DarwinAsmParser::parseDirectivePrevious>(".previous"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveSecureLogUnique>( ".secure_log_unique"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveSecureLogReset>( ".secure_log_reset"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveTBSS>(".tbss"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveZerofill>(".zerofill"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveDataRegion>( ".data_region"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveDataRegionEnd>( ".end_data_region"); // Special section directives. addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveBss>(".bss"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveConst>(".const"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveConstData>( ".const_data"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveConstructor>( ".constructor"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveCString>( ".cstring"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveData>(".data"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveDestructor>( ".destructor"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveDyld>(".dyld"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveFVMLibInit0>( ".fvmlib_init0"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveFVMLibInit1>( ".fvmlib_init1"); addDirectiveHandler< &DarwinAsmParser::parseSectionDirectiveLazySymbolPointers>( ".lazy_symbol_pointer"); addDirectiveHandler<&DarwinAsmParser::parseDirectiveLinkerOption>( ".linker_option"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveLiteral16>( ".literal16"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveLiteral4>( ".literal4"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveLiteral8>( ".literal8"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveModInitFunc>( ".mod_init_func"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveModTermFunc>( ".mod_term_func"); addDirectiveHandler< &DarwinAsmParser::parseSectionDirectiveNonLazySymbolPointers>( ".non_lazy_symbol_pointer"); addDirectiveHandler< &DarwinAsmParser::parseSectionDirectiveThreadLocalVariablePointers>( ".thread_local_variable_pointer"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCCatClsMeth>( ".objc_cat_cls_meth"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCCatInstMeth>( ".objc_cat_inst_meth"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCCategory>( ".objc_category"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCClass>( ".objc_class"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCClassNames>( ".objc_class_names"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCClassVars>( ".objc_class_vars"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCClsMeth>( ".objc_cls_meth"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCClsRefs>( ".objc_cls_refs"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCInstMeth>( ".objc_inst_meth"); addDirectiveHandler< &DarwinAsmParser::parseSectionDirectiveObjCInstanceVars>( ".objc_instance_vars"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCMessageRefs>( ".objc_message_refs"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCMetaClass>( ".objc_meta_class"); addDirectiveHandler< &DarwinAsmParser::parseSectionDirectiveObjCMethVarNames>( ".objc_meth_var_names"); addDirectiveHandler< &DarwinAsmParser::parseSectionDirectiveObjCMethVarTypes>( ".objc_meth_var_types"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCModuleInfo>( ".objc_module_info"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCProtocol>( ".objc_protocol"); addDirectiveHandler< &DarwinAsmParser::parseSectionDirectiveObjCSelectorStrs>( ".objc_selector_strs"); addDirectiveHandler< &DarwinAsmParser::parseSectionDirectiveObjCStringObject>( ".objc_string_object"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveObjCSymbols>( ".objc_symbols"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectivePICSymbolStub>( ".picsymbol_stub"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveStaticConst>( ".static_const"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveStaticData>( ".static_data"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveSymbolStub>( ".symbol_stub"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveTData>(".tdata"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveText>(".text"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveThreadInitFunc>( ".thread_init_func"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveTLV>(".tlv"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveIdent>(".ident"); addDirectiveHandler<&DarwinAsmParser::parseWatchOSVersionMin>( ".watchos_version_min"); addDirectiveHandler<&DarwinAsmParser::parseTvOSVersionMin>( ".tvos_version_min"); addDirectiveHandler<&DarwinAsmParser::parseIOSVersionMin>( ".ios_version_min"); addDirectiveHandler<&DarwinAsmParser::parseMacOSXVersionMin>( ".macosx_version_min"); addDirectiveHandler<&DarwinAsmParser::parseBuildVersion>(".build_version"); LastVersionDirective = SMLoc(); } bool parseDirectiveAltEntry(StringRef, SMLoc); bool parseDirectiveDesc(StringRef, SMLoc); bool parseDirectiveIndirectSymbol(StringRef, SMLoc); bool parseDirectiveDumpOrLoad(StringRef, SMLoc); bool parseDirectiveLsym(StringRef, SMLoc); bool parseDirectiveLinkerOption(StringRef, SMLoc); bool parseDirectiveSection(StringRef, SMLoc); bool parseDirectivePushSection(StringRef, SMLoc); bool parseDirectivePopSection(StringRef, SMLoc); bool parseDirectivePrevious(StringRef, SMLoc); bool parseDirectiveSecureLogReset(StringRef, SMLoc); bool parseDirectiveSecureLogUnique(StringRef, SMLoc); bool parseDirectiveSubsectionsViaSymbols(StringRef, SMLoc); bool parseDirectiveTBSS(StringRef, SMLoc); bool parseDirectiveZerofill(StringRef, SMLoc); bool parseDirectiveDataRegion(StringRef, SMLoc); bool parseDirectiveDataRegionEnd(StringRef, SMLoc); // Named Section Directive bool parseSectionDirectiveBss(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__bss"); } bool parseSectionDirectiveConst(StringRef, SMLoc) { return parseSectionSwitch("__TEXT", "__const"); } bool parseSectionDirectiveStaticConst(StringRef, SMLoc) { return parseSectionSwitch("__TEXT", "__static_const"); } bool parseSectionDirectiveCString(StringRef, SMLoc) { return parseSectionSwitch("__TEXT","__cstring", MachO::S_CSTRING_LITERALS); } bool parseSectionDirectiveLiteral4(StringRef, SMLoc) { return parseSectionSwitch("__TEXT", "__literal4", MachO::S_4BYTE_LITERALS, 4); } bool parseSectionDirectiveLiteral8(StringRef, SMLoc) { return parseSectionSwitch("__TEXT", "__literal8", MachO::S_8BYTE_LITERALS, 8); } bool parseSectionDirectiveLiteral16(StringRef, SMLoc) { return parseSectionSwitch("__TEXT","__literal16", MachO::S_16BYTE_LITERALS, 16); } bool parseSectionDirectiveConstructor(StringRef, SMLoc) { return parseSectionSwitch("__TEXT","__constructor"); } bool parseSectionDirectiveDestructor(StringRef, SMLoc) { return parseSectionSwitch("__TEXT","__destructor"); } bool parseSectionDirectiveFVMLibInit0(StringRef, SMLoc) { return parseSectionSwitch("__TEXT","__fvmlib_init0"); } bool parseSectionDirectiveFVMLibInit1(StringRef, SMLoc) { return parseSectionSwitch("__TEXT","__fvmlib_init1"); } bool parseSectionDirectiveSymbolStub(StringRef, SMLoc) { return parseSectionSwitch("__TEXT","__symbol_stub", MachO::S_SYMBOL_STUBS | MachO::S_ATTR_PURE_INSTRUCTIONS, // FIXME: Different on PPC and ARM. 0, 16); } bool parseSectionDirectivePICSymbolStub(StringRef, SMLoc) { return parseSectionSwitch("__TEXT","__picsymbol_stub", MachO::S_SYMBOL_STUBS | MachO::S_ATTR_PURE_INSTRUCTIONS, 0, 26); } bool parseSectionDirectiveData(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__data"); } bool parseSectionDirectiveStaticData(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__static_data"); } bool parseSectionDirectiveNonLazySymbolPointers(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__nl_symbol_ptr", MachO::S_NON_LAZY_SYMBOL_POINTERS, 4); } bool parseSectionDirectiveLazySymbolPointers(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__la_symbol_ptr", MachO::S_LAZY_SYMBOL_POINTERS, 4); } bool parseSectionDirectiveThreadLocalVariablePointers(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__thread_ptr", MachO::S_THREAD_LOCAL_VARIABLE_POINTERS, 4); } bool parseSectionDirectiveDyld(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__dyld"); } bool parseSectionDirectiveModInitFunc(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__mod_init_func", MachO::S_MOD_INIT_FUNC_POINTERS, 4); } bool parseSectionDirectiveModTermFunc(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__mod_term_func", MachO::S_MOD_TERM_FUNC_POINTERS, 4); } bool parseSectionDirectiveConstData(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__const"); } bool parseSectionDirectiveObjCClass(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__class", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCMetaClass(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__meta_class", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCCatClsMeth(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__cat_cls_meth", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCCatInstMeth(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__cat_inst_meth", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCProtocol(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__protocol", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCStringObject(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__string_object", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCClsMeth(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__cls_meth", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCInstMeth(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__inst_meth", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCClsRefs(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__cls_refs", MachO::S_ATTR_NO_DEAD_STRIP | MachO::S_LITERAL_POINTERS, 4); } bool parseSectionDirectiveObjCMessageRefs(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__message_refs", MachO::S_ATTR_NO_DEAD_STRIP | MachO::S_LITERAL_POINTERS, 4); } bool parseSectionDirectiveObjCSymbols(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__symbols", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCCategory(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__category", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCClassVars(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__class_vars", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCInstanceVars(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__instance_vars", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCModuleInfo(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__module_info", MachO::S_ATTR_NO_DEAD_STRIP); } bool parseSectionDirectiveObjCClassNames(StringRef, SMLoc) { return parseSectionSwitch("__TEXT", "__cstring", MachO::S_CSTRING_LITERALS); } bool parseSectionDirectiveObjCMethVarTypes(StringRef, SMLoc) { return parseSectionSwitch("__TEXT", "__cstring", MachO::S_CSTRING_LITERALS); } bool parseSectionDirectiveObjCMethVarNames(StringRef, SMLoc) { return parseSectionSwitch("__TEXT", "__cstring", MachO::S_CSTRING_LITERALS); } bool parseSectionDirectiveObjCSelectorStrs(StringRef, SMLoc) { return parseSectionSwitch("__OBJC", "__selector_strs", MachO::S_CSTRING_LITERALS); } bool parseSectionDirectiveTData(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__thread_data", MachO::S_THREAD_LOCAL_REGULAR); } bool parseSectionDirectiveText(StringRef, SMLoc) { return parseSectionSwitch("__TEXT", "__text", MachO::S_ATTR_PURE_INSTRUCTIONS); } bool parseSectionDirectiveTLV(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__thread_vars", MachO::S_THREAD_LOCAL_VARIABLES); } bool parseSectionDirectiveIdent(StringRef, SMLoc) { // Darwin silently ignores the .ident directive. getParser().eatToEndOfStatement(); return false; } bool parseSectionDirectiveThreadInitFunc(StringRef, SMLoc) { return parseSectionSwitch("__DATA", "__thread_init", MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS); } bool parseWatchOSVersionMin(StringRef Directive, SMLoc Loc) { return parseVersionMin(Directive, Loc, MCVM_WatchOSVersionMin); } bool parseTvOSVersionMin(StringRef Directive, SMLoc Loc) { return parseVersionMin(Directive, Loc, MCVM_TvOSVersionMin); } bool parseIOSVersionMin(StringRef Directive, SMLoc Loc) { return parseVersionMin(Directive, Loc, MCVM_IOSVersionMin); } bool parseMacOSXVersionMin(StringRef Directive, SMLoc Loc) { return parseVersionMin(Directive, Loc, MCVM_OSXVersionMin); } bool parseBuildVersion(StringRef Directive, SMLoc Loc); bool parseVersionMin(StringRef Directive, SMLoc Loc, MCVersionMinType Type); bool parseVersion(unsigned *Major, unsigned *Minor, unsigned *Update); void checkVersion(StringRef Directive, StringRef Arg, SMLoc Loc, Triple::OSType ExpectedOS); }; } // end anonymous namespace bool DarwinAsmParser::parseSectionSwitch(StringRef Segment, StringRef Section, unsigned TAA, unsigned Align, unsigned StubSize) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in section switching directive"); Lex(); // FIXME: Arch specific. bool isText = TAA & MachO::S_ATTR_PURE_INSTRUCTIONS; getStreamer().SwitchSection(getContext().getMachOSection( Segment, Section, TAA, StubSize, isText ? SectionKind::getText() : SectionKind::getData())); // Set the implicit alignment, if any. // // FIXME: This isn't really what 'as' does; I think it just uses the implicit // alignment on the section (e.g., if one manually inserts bytes into the // section, then just issuing the section switch directive will not realign // the section. However, this is arguably more reasonable behavior, and there // is no good reason for someone to intentionally emit incorrectly sized // values into the implicitly aligned sections. if (Align) getStreamer().EmitValueToAlignment(Align); return false; } /// parseDirectiveAltEntry /// ::= .alt_entry identifier bool DarwinAsmParser::parseDirectiveAltEntry(StringRef, SMLoc) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Look up symbol. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); if (Sym->isDefined()) return TokError(".alt_entry must preceed symbol definition"); if (!getStreamer().EmitSymbolAttribute(Sym, MCSA_AltEntry)) return TokError("unable to emit symbol attribute"); Lex(); return false; } /// parseDirectiveDesc /// ::= .desc identifier , expression bool DarwinAsmParser::parseDirectiveDesc(StringRef, SMLoc) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.desc' directive"); Lex(); int64_t DescValue; if (getParser().parseAbsoluteExpression(DescValue)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.desc' directive"); Lex(); // Set the n_desc field of this Symbol to this DescValue getStreamer().EmitSymbolDesc(Sym, DescValue); return false; } /// parseDirectiveIndirectSymbol /// ::= .indirect_symbol identifier bool DarwinAsmParser::parseDirectiveIndirectSymbol(StringRef, SMLoc Loc) { const MCSectionMachO *Current = static_cast( getStreamer().getCurrentSectionOnly()); MachO::SectionType SectionType = Current->getType(); if (SectionType != MachO::S_NON_LAZY_SYMBOL_POINTERS && SectionType != MachO::S_LAZY_SYMBOL_POINTERS && SectionType != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS && SectionType != MachO::S_SYMBOL_STUBS) return Error(Loc, "indirect symbol not in a symbol pointer or stub " "section"); StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in .indirect_symbol directive"); MCSymbol *Sym = getContext().getOrCreateSymbol(Name); // Assembler local symbols don't make any sense here. Complain loudly. if (Sym->isTemporary()) return TokError("non-local symbol required in directive"); if (!getStreamer().EmitSymbolAttribute(Sym, MCSA_IndirectSymbol)) return TokError("unable to emit indirect symbol attribute for: " + Name); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.indirect_symbol' directive"); Lex(); return false; } /// parseDirectiveDumpOrLoad /// ::= ( .dump | .load ) "filename" bool DarwinAsmParser::parseDirectiveDumpOrLoad(StringRef Directive, SMLoc IDLoc) { bool IsDump = Directive == ".dump"; if (getLexer().isNot(AsmToken::String)) return TokError("expected string in '.dump' or '.load' directive"); Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.dump' or '.load' directive"); Lex(); // FIXME: If/when .dump and .load are implemented they will be done in the // the assembly parser and not have any need for an MCStreamer API. if (IsDump) return Warning(IDLoc, "ignoring directive .dump for now"); else return Warning(IDLoc, "ignoring directive .load for now"); } /// ParseDirectiveLinkerOption /// ::= .linker_option "string" ( , "string" )* bool DarwinAsmParser::parseDirectiveLinkerOption(StringRef IDVal, SMLoc) { SmallVector Args; while (true) { if (getLexer().isNot(AsmToken::String)) return TokError("expected string in '" + Twine(IDVal) + "' directive"); std::string Data; if (getParser().parseEscapedString(Data)) return true; Args.push_back(Data); if (getLexer().is(AsmToken::EndOfStatement)) break; if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); Lex(); } getStreamer().EmitLinkerOptions(Args); return false; } /// parseDirectiveLsym /// ::= .lsym identifier , expression bool DarwinAsmParser::parseDirectiveLsym(StringRef, SMLoc) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in '.lsym' directive"); Lex(); const MCExpr *Value; if (getParser().parseExpression(Value)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.lsym' directive"); Lex(); // We don't currently support this directive. // // FIXME: Diagnostic location! (void) Sym; return TokError("directive '.lsym' is unsupported"); } /// parseDirectiveSection: /// ::= .section identifier (',' identifier)* bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) { SMLoc Loc = getLexer().getLoc(); StringRef SectionName; if (getParser().parseIdentifier(SectionName)) return Error(Loc, "expected identifier after '.section' directive"); // Verify there is a following comma. if (!getLexer().is(AsmToken::Comma)) return TokError("unexpected token in '.section' directive"); std::string SectionSpec = SectionName; SectionSpec += ","; // Add all the tokens until the end of the line, ParseSectionSpecifier will // handle this. StringRef EOL = getLexer().LexUntilEndOfStatement(); SectionSpec.append(EOL.begin(), EOL.end()); Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.section' directive"); Lex(); StringRef Segment, Section; unsigned StubSize; unsigned TAA; bool TAAParsed; std::string ErrorStr = MCSectionMachO::ParseSectionSpecifier(SectionSpec, Segment, Section, TAA, TAAParsed, StubSize); if (!ErrorStr.empty()) return Error(Loc, ErrorStr); // Issue a warning if the target is not powerpc and Section is a *coal* section. Triple TT = getParser().getContext().getObjectFileInfo()->getTargetTriple(); Triple::ArchType ArchTy = TT.getArch(); if (ArchTy != Triple::ppc && ArchTy != Triple::ppc64) { StringRef NonCoalSection = StringSwitch(Section) .Case("__textcoal_nt", "__text") .Case("__const_coal", "__const") .Case("__datacoal_nt", "__data") .Default(Section); if (!Section.equals(NonCoalSection)) { StringRef SectionVal(Loc.getPointer()); size_t B = SectionVal.find(',') + 1, E = SectionVal.find(',', B); SMLoc BLoc = SMLoc::getFromPointer(SectionVal.data() + B); SMLoc ELoc = SMLoc::getFromPointer(SectionVal.data() + E); getParser().Warning(Loc, "section \"" + Section + "\" is deprecated", SMRange(BLoc, ELoc)); getParser().Note(Loc, "change section name to \"" + NonCoalSection + "\"", SMRange(BLoc, ELoc)); } } // FIXME: Arch specific. bool isText = Segment == "__TEXT"; // FIXME: Hack. getStreamer().SwitchSection(getContext().getMachOSection( Segment, Section, TAA, StubSize, isText ? SectionKind::getText() : SectionKind::getData())); return false; } /// ParseDirectivePushSection: /// ::= .pushsection identifier (',' identifier)* bool DarwinAsmParser::parseDirectivePushSection(StringRef S, SMLoc Loc) { getStreamer().PushSection(); if (parseDirectiveSection(S, Loc)) { getStreamer().PopSection(); return true; } return false; } /// ParseDirectivePopSection: /// ::= .popsection bool DarwinAsmParser::parseDirectivePopSection(StringRef, SMLoc) { if (!getStreamer().PopSection()) return TokError(".popsection without corresponding .pushsection"); return false; } /// ParseDirectivePrevious: /// ::= .previous bool DarwinAsmParser::parseDirectivePrevious(StringRef DirName, SMLoc) { MCSectionSubPair PreviousSection = getStreamer().getPreviousSection(); if (!PreviousSection.first) return TokError(".previous without corresponding .section"); getStreamer().SwitchSection(PreviousSection.first, PreviousSection.second); return false; } /// ParseDirectiveSecureLogUnique /// ::= .secure_log_unique ... message ... bool DarwinAsmParser::parseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { StringRef LogMessage = getParser().parseStringToEndOfStatement(); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.secure_log_unique' directive"); if (getContext().getSecureLogUsed()) return Error(IDLoc, ".secure_log_unique specified multiple times"); // Get the secure log path. const char *SecureLogFile = getContext().getSecureLogFile(); if (!SecureLogFile) return Error(IDLoc, ".secure_log_unique used but AS_SECURE_LOG_FILE " "environment variable unset."); // Open the secure log file if we haven't already. raw_fd_ostream *OS = getContext().getSecureLog(); if (!OS) { std::error_code EC; auto NewOS = llvm::make_unique( StringRef(SecureLogFile), EC, sys::fs::F_Append | sys::fs::F_Text); if (EC) return Error(IDLoc, Twine("can't open secure log file: ") + SecureLogFile + " (" + EC.message() + ")"); OS = NewOS.get(); getContext().setSecureLog(std::move(NewOS)); } // Write the message. unsigned CurBuf = getSourceManager().FindBufferContainingLoc(IDLoc); *OS << getSourceManager().getBufferInfo(CurBuf).Buffer->getBufferIdentifier() << ":" << getSourceManager().FindLineNumber(IDLoc, CurBuf) << ":" << LogMessage + "\n"; getContext().setSecureLogUsed(true); return false; } /// ParseDirectiveSecureLogReset /// ::= .secure_log_reset bool DarwinAsmParser::parseDirectiveSecureLogReset(StringRef, SMLoc IDLoc) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.secure_log_reset' directive"); Lex(); getContext().setSecureLogUsed(false); return false; } /// parseDirectiveSubsectionsViaSymbols /// ::= .subsections_via_symbols bool DarwinAsmParser::parseDirectiveSubsectionsViaSymbols(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.subsections_via_symbols' directive"); Lex(); getStreamer().EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); return false; } /// ParseDirectiveTBSS /// ::= .tbss identifier, size, align bool DarwinAsmParser::parseDirectiveTBSS(StringRef, SMLoc) { SMLoc IDLoc = getLexer().getLoc(); StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); int64_t Size; SMLoc SizeLoc = getLexer().getLoc(); if (getParser().parseAbsoluteExpression(Size)) return true; int64_t Pow2Alignment = 0; SMLoc Pow2AlignmentLoc; if (getLexer().is(AsmToken::Comma)) { Lex(); Pow2AlignmentLoc = getLexer().getLoc(); if (getParser().parseAbsoluteExpression(Pow2Alignment)) return true; } if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.tbss' directive"); Lex(); if (Size < 0) return Error(SizeLoc, "invalid '.tbss' directive size, can't be less than" "zero"); // FIXME: Diagnose overflow. if (Pow2Alignment < 0) return Error(Pow2AlignmentLoc, "invalid '.tbss' alignment, can't be less" "than zero"); if (!Sym->isUndefined()) return Error(IDLoc, "invalid symbol redefinition"); getStreamer().EmitTBSSSymbol(getContext().getMachOSection( "__DATA", "__thread_bss", MachO::S_THREAD_LOCAL_ZEROFILL, 0, SectionKind::getThreadBSS()), Sym, Size, 1 << Pow2Alignment); return false; } /// ParseDirectiveZerofill /// ::= .zerofill segname , sectname [, identifier , size_expression [ /// , align_expression ]] bool DarwinAsmParser::parseDirectiveZerofill(StringRef, SMLoc) { StringRef Segment; if (getParser().parseIdentifier(Segment)) return TokError("expected segment name after '.zerofill' directive"); if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); StringRef Section; if (getParser().parseIdentifier(Section)) return TokError("expected section name after comma in '.zerofill' " "directive"); // If this is the end of the line all that was wanted was to create the // the section but with no symbol. if (getLexer().is(AsmToken::EndOfStatement)) { // Create the zerofill section but no symbol getStreamer().EmitZerofill(getContext().getMachOSection( Segment, Section, MachO::S_ZEROFILL, 0, SectionKind::getBSS())); return false; } if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); SMLoc IDLoc = getLexer().getLoc(); StringRef IDStr; if (getParser().parseIdentifier(IDStr)) return TokError("expected identifier in directive"); // handle the identifier as the key symbol. MCSymbol *Sym = getContext().getOrCreateSymbol(IDStr); if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); int64_t Size; SMLoc SizeLoc = getLexer().getLoc(); if (getParser().parseAbsoluteExpression(Size)) return true; int64_t Pow2Alignment = 0; SMLoc Pow2AlignmentLoc; if (getLexer().is(AsmToken::Comma)) { Lex(); Pow2AlignmentLoc = getLexer().getLoc(); if (getParser().parseAbsoluteExpression(Pow2Alignment)) return true; } if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.zerofill' directive"); Lex(); if (Size < 0) return Error(SizeLoc, "invalid '.zerofill' directive size, can't be less " "than zero"); // NOTE: The alignment in the directive is a power of 2 value, the assembler // may internally end up wanting an alignment in bytes. // FIXME: Diagnose overflow. if (Pow2Alignment < 0) return Error(Pow2AlignmentLoc, "invalid '.zerofill' directive alignment, " "can't be less than zero"); if (!Sym->isUndefined()) return Error(IDLoc, "invalid symbol redefinition"); // Create the zerofill Symbol with Size and Pow2Alignment // // FIXME: Arch specific. getStreamer().EmitZerofill(getContext().getMachOSection( Segment, Section, MachO::S_ZEROFILL, 0, SectionKind::getBSS()), Sym, Size, 1 << Pow2Alignment); return false; } /// ParseDirectiveDataRegion /// ::= .data_region [ ( jt8 | jt16 | jt32 ) ] bool DarwinAsmParser::parseDirectiveDataRegion(StringRef, SMLoc) { if (getLexer().is(AsmToken::EndOfStatement)) { Lex(); getStreamer().EmitDataRegion(MCDR_DataRegion); return false; } StringRef RegionType; SMLoc Loc = getParser().getTok().getLoc(); if (getParser().parseIdentifier(RegionType)) return TokError("expected region type after '.data_region' directive"); int Kind = StringSwitch(RegionType) .Case("jt8", MCDR_DataRegionJT8) .Case("jt16", MCDR_DataRegionJT16) .Case("jt32", MCDR_DataRegionJT32) .Default(-1); if (Kind == -1) return Error(Loc, "unknown region type in '.data_region' directive"); Lex(); getStreamer().EmitDataRegion((MCDataRegionType)Kind); return false; } /// ParseDirectiveDataRegionEnd /// ::= .end_data_region bool DarwinAsmParser::parseDirectiveDataRegionEnd(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.end_data_region' directive"); Lex(); getStreamer().EmitDataRegion(MCDR_DataRegionEnd); return false; } /// parseVersion ::= major, minor [, update] bool DarwinAsmParser::parseVersion(unsigned *Major, unsigned *Minor, unsigned *Update) { // Get the major version number. if (getLexer().isNot(AsmToken::Integer)) return TokError("invalid OS major version number, integer expected"); int64_t MajorVal = getLexer().getTok().getIntVal(); if (MajorVal > 65535 || MajorVal <= 0) return TokError("invalid OS major version number"); *Major = (unsigned)MajorVal; Lex(); if (getLexer().isNot(AsmToken::Comma)) return TokError("OS minor version number required, comma expected"); Lex(); // Get the minor version number. if (getLexer().isNot(AsmToken::Integer)) return TokError("invalid OS minor version number, integer expected"); int64_t MinorVal = getLexer().getTok().getIntVal(); if (MinorVal > 255 || MinorVal < 0) return TokError("invalid OS minor version number"); *Minor = MinorVal; Lex(); // Get the update level, if specified *Update = 0; if (getLexer().is(AsmToken::EndOfStatement)) return false; if (getLexer().isNot(AsmToken::Comma)) return TokError("invalid OS update specifier, comma expected"); Lex(); if (getLexer().isNot(AsmToken::Integer)) return TokError("invalid OS update version number, integer expected"); int64_t UpdateVal = getLexer().getTok().getIntVal(); if (UpdateVal > 255 || UpdateVal < 0) return TokError("invalid OS update version number"); *Update = UpdateVal; Lex(); return false; } void DarwinAsmParser::checkVersion(StringRef Directive, StringRef Arg, SMLoc Loc, Triple::OSType ExpectedOS) { const Triple &Target = getContext().getObjectFileInfo()->getTargetTriple(); if (Target.getOS() != ExpectedOS) Warning(Loc, Twine(Directive) + (Arg.empty() ? Twine() : Twine(' ') + Arg) + " used while targeting " + Target.getOSName()); if (LastVersionDirective.isValid()) { Warning(Loc, "overriding previous version directive"); Note(LastVersionDirective, "previous definition is here"); } LastVersionDirective = Loc; } static Triple::OSType getOSTypeFromMCVM(MCVersionMinType Type) { switch (Type) { case MCVM_WatchOSVersionMin: return Triple::WatchOS; case MCVM_TvOSVersionMin: return Triple::TvOS; case MCVM_IOSVersionMin: return Triple::IOS; case MCVM_OSXVersionMin: return Triple::MacOSX; } llvm_unreachable("Invalid mc version min type"); } /// parseVersionMin /// ::= .ios_version_min parseVersion /// | .macosx_version_min parseVersion /// | .tvos_version_min parseVersion /// | .watchos_version_min parseVersion bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc, MCVersionMinType Type) { unsigned Major; unsigned Minor; unsigned Update; if (parseVersion(&Major, &Minor, &Update)) return true; if (parseToken(AsmToken::EndOfStatement)) return addErrorSuffix(Twine(" in '") + Directive + "' directive"); Triple::OSType ExpectedOS = getOSTypeFromMCVM(Type); checkVersion(Directive, StringRef(), Loc, ExpectedOS); getStreamer().EmitVersionMin(Type, Major, Minor, Update); return false; } -Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) { +static Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) { switch (Type) { case MachO::PLATFORM_MACOS: return Triple::MacOSX; case MachO::PLATFORM_IOS: return Triple::IOS; case MachO::PLATFORM_TVOS: return Triple::TvOS; case MachO::PLATFORM_WATCHOS: return Triple::WatchOS; case MachO::PLATFORM_BRIDGEOS: /* silence warning */break; } llvm_unreachable("Invalid mach-o platform type"); } /// parseBuildVersion /// ::= .build_version (macos|ios|tvos|watchos), parseVersion bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) { StringRef PlatformName; SMLoc PlatformLoc = getTok().getLoc(); if (getParser().parseIdentifier(PlatformName)) return TokError("platform name expected"); unsigned Platform = StringSwitch(PlatformName) .Case("macos", MachO::PLATFORM_MACOS) .Case("ios", MachO::PLATFORM_IOS) .Case("tvos", MachO::PLATFORM_TVOS) .Case("watchos", MachO::PLATFORM_WATCHOS) .Default(0); if (Platform == 0) return Error(PlatformLoc, "unknown platform name"); if (getLexer().isNot(AsmToken::Comma)) return TokError("version number required, comma expected"); Lex(); unsigned Major; unsigned Minor; unsigned Update; if (parseVersion(&Major, &Minor, &Update)) return true; if (parseToken(AsmToken::EndOfStatement)) return addErrorSuffix(" in '.build_version' directive"); Triple::OSType ExpectedOS = getOSTypeFromPlatform((MachO::PlatformType)Platform); checkVersion(Directive, PlatformName, Loc, ExpectedOS); getStreamer().EmitBuildVersion(Platform, Major, Minor, Update); return false; } namespace llvm { MCAsmParserExtension *createDarwinAsmParser() { return new DarwinAsmParser; } } // end llvm namespace Index: vendor/llvm/dist/lib/MC/MCParser/ELFAsmParser.cpp =================================================================== --- vendor/llvm/dist/lib/MC/MCParser/ELFAsmParser.cpp (revision 327151) +++ vendor/llvm/dist/lib/MC/MCParser/ELFAsmParser.cpp (revision 327152) @@ -1,847 +1,851 @@ //===- ELFAsmParser.cpp - ELF Assembly Parser -----------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/Casting.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" #include #include #include using namespace llvm; namespace { class ELFAsmParser : public MCAsmParserExtension { template void addDirectiveHandler(StringRef Directive) { MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair( this, HandleDirective); getParser().addDirectiveHandler(Directive, Handler); } bool ParseSectionSwitch(StringRef Section, unsigned Type, unsigned Flags, SectionKind Kind); public: ELFAsmParser() { BracketExpressionsSupported = true; } void Initialize(MCAsmParser &Parser) override { // Call the base implementation. this->MCAsmParserExtension::Initialize(Parser); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveData>(".data"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveText>(".text"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveBSS>(".bss"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveRoData>(".rodata"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTData>(".tdata"); addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTBSS>(".tbss"); addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRel>(".data.rel"); addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro"); addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section"); addDirectiveHandler< &ELFAsmParser::ParseDirectivePushSection>(".pushsection"); addDirectiveHandler<&ELFAsmParser::ParseDirectivePopSection>(".popsection"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSize>(".size"); addDirectiveHandler<&ELFAsmParser::ParseDirectivePrevious>(".previous"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveType>(".type"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveVersion>(".version"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local"); addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".protected"); addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".internal"); addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".hidden"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSubsection>(".subsection"); } // FIXME: Part of this logic is duplicated in the MCELFStreamer. What is // the best way for us to get access to it? bool ParseSectionDirectiveData(StringRef, SMLoc) { return ParseSectionSwitch(".data", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, SectionKind::getData()); } bool ParseSectionDirectiveText(StringRef, SMLoc) { return ParseSectionSwitch(".text", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, SectionKind::getText()); } bool ParseSectionDirectiveBSS(StringRef, SMLoc) { return ParseSectionSwitch(".bss", ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, SectionKind::getBSS()); } bool ParseSectionDirectiveRoData(StringRef, SMLoc) { return ParseSectionSwitch(".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC, SectionKind::getReadOnly()); } bool ParseSectionDirectiveTData(StringRef, SMLoc) { return ParseSectionSwitch(".tdata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE, SectionKind::getThreadData()); } bool ParseSectionDirectiveTBSS(StringRef, SMLoc) { return ParseSectionSwitch(".tbss", ELF::SHT_NOBITS, ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE, SectionKind::getThreadBSS()); } bool ParseSectionDirectiveDataRel(StringRef, SMLoc) { return ParseSectionSwitch(".data.rel", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE, SectionKind::getData()); } bool ParseSectionDirectiveDataRelRo(StringRef, SMLoc) { return ParseSectionSwitch(".data.rel.ro", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE, SectionKind::getReadOnlyWithRel()); } bool ParseSectionDirectiveEhFrame(StringRef, SMLoc) { return ParseSectionSwitch(".eh_frame", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE, SectionKind::getData()); } bool ParseDirectivePushSection(StringRef, SMLoc); bool ParseDirectivePopSection(StringRef, SMLoc); bool ParseDirectiveSection(StringRef, SMLoc); bool ParseDirectiveSize(StringRef, SMLoc); bool ParseDirectivePrevious(StringRef, SMLoc); bool ParseDirectiveType(StringRef, SMLoc); bool ParseDirectiveIdent(StringRef, SMLoc); bool ParseDirectiveSymver(StringRef, SMLoc); bool ParseDirectiveVersion(StringRef, SMLoc); bool ParseDirectiveWeakref(StringRef, SMLoc); bool ParseDirectiveSymbolAttribute(StringRef, SMLoc); bool ParseDirectiveSubsection(StringRef, SMLoc); private: bool ParseSectionName(StringRef &SectionName); bool ParseSectionArguments(bool IsPush, SMLoc loc); unsigned parseSunStyleSectionFlags(); bool maybeParseSectionType(StringRef &TypeName); bool parseMergeSize(int64_t &Size); bool parseGroup(StringRef &GroupName); bool parseMetadataSym(MCSymbolELF *&Associated); bool maybeParseUniqueID(int64_t &UniqueID); }; } // end anonymous namespace /// ParseDirectiveSymbolAttribute /// ::= { ".local", ".weak", ... } [ identifier ( , identifier )* ] bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { MCSymbolAttr Attr = StringSwitch(Directive) .Case(".weak", MCSA_Weak) .Case(".local", MCSA_Local) .Case(".hidden", MCSA_Hidden) .Case(".internal", MCSA_Internal) .Case(".protected", MCSA_Protected) .Default(MCSA_Invalid); assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!"); if (getLexer().isNot(AsmToken::EndOfStatement)) { while (true) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().EmitSymbolAttribute(Sym, Attr); if (getLexer().is(AsmToken::EndOfStatement)) break; if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); } } Lex(); return false; } bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type, unsigned Flags, SectionKind Kind) { const MCExpr *Subsection = nullptr; if (getLexer().isNot(AsmToken::EndOfStatement)) { if (getParser().parseExpression(Subsection)) return true; } Lex(); getStreamer().SwitchSection(getContext().getELFSection(Section, Type, Flags), Subsection); return false; } bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbolELF *Sym = cast(getContext().getOrCreateSymbol(Name)); if (getLexer().isNot(AsmToken::Comma)) return TokError("unexpected token in directive"); Lex(); const MCExpr *Expr; if (getParser().parseExpression(Expr)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); Lex(); getStreamer().emitELFSize(Sym, Expr); return false; } bool ELFAsmParser::ParseSectionName(StringRef &SectionName) { // A section name can contain -, so we cannot just use // parseIdentifier. SMLoc FirstLoc = getLexer().getLoc(); unsigned Size = 0; if (getLexer().is(AsmToken::String)) { SectionName = getTok().getIdentifier(); Lex(); return false; } while (!getParser().hasPendingError()) { SMLoc PrevLoc = getLexer().getLoc(); if (getLexer().is(AsmToken::Comma) || getLexer().is(AsmToken::EndOfStatement)) break; unsigned CurSize; if (getLexer().is(AsmToken::String)) { CurSize = getTok().getIdentifier().size() + 2; Lex(); } else if (getLexer().is(AsmToken::Identifier)) { CurSize = getTok().getIdentifier().size(); Lex(); } else { CurSize = getTok().getString().size(); Lex(); } Size += CurSize; SectionName = StringRef(FirstLoc.getPointer(), Size); // Make sure the following token is adjacent. if (PrevLoc.getPointer() + CurSize != getTok().getLoc().getPointer()) break; } if (Size == 0) return true; return false; } static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) { unsigned flags = 0; // If a valid numerical value is set for the section flag, use it verbatim if (!flagsStr.getAsInteger(0, flags)) return flags; for (char i : flagsStr) { switch (i) { case 'a': flags |= ELF::SHF_ALLOC; break; case 'e': flags |= ELF::SHF_EXCLUDE; break; case 'x': flags |= ELF::SHF_EXECINSTR; break; case 'w': flags |= ELF::SHF_WRITE; break; case 'o': flags |= ELF::SHF_LINK_ORDER; break; case 'M': flags |= ELF::SHF_MERGE; break; case 'S': flags |= ELF::SHF_STRINGS; break; case 'T': flags |= ELF::SHF_TLS; break; case 'c': flags |= ELF::XCORE_SHF_CP_SECTION; break; case 'd': flags |= ELF::XCORE_SHF_DP_SECTION; break; case 'y': flags |= ELF::SHF_ARM_PURECODE; break; case 'G': flags |= ELF::SHF_GROUP; break; case '?': *UseLastGroup = true; break; default: return -1U; } } return flags; } unsigned ELFAsmParser::parseSunStyleSectionFlags() { unsigned flags = 0; while (getLexer().is(AsmToken::Hash)) { Lex(); // Eat the #. if (!getLexer().is(AsmToken::Identifier)) return -1U; StringRef flagId = getTok().getIdentifier(); if (flagId == "alloc") flags |= ELF::SHF_ALLOC; else if (flagId == "execinstr") flags |= ELF::SHF_EXECINSTR; else if (flagId == "write") flags |= ELF::SHF_WRITE; else if (flagId == "tls") flags |= ELF::SHF_TLS; else return -1U; Lex(); // Eat the flag. if (!getLexer().is(AsmToken::Comma)) break; Lex(); // Eat the comma. } return flags; } bool ELFAsmParser::ParseDirectivePushSection(StringRef s, SMLoc loc) { getStreamer().PushSection(); if (ParseSectionArguments(/*IsPush=*/true, loc)) { getStreamer().PopSection(); return true; } return false; } bool ELFAsmParser::ParseDirectivePopSection(StringRef, SMLoc) { if (!getStreamer().PopSection()) return TokError(".popsection without corresponding .pushsection"); return false; } // FIXME: This is a work in progress. bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc loc) { return ParseSectionArguments(/*IsPush=*/false, loc); } bool ELFAsmParser::maybeParseSectionType(StringRef &TypeName) { MCAsmLexer &L = getLexer(); if (L.isNot(AsmToken::Comma)) return false; Lex(); if (L.isNot(AsmToken::At) && L.isNot(AsmToken::Percent) && L.isNot(AsmToken::String)) { if (L.getAllowAtInIdentifier()) return TokError("expected '@', '%' or \"\""); else return TokError("expected '%' or \"\""); } if (!L.is(AsmToken::String)) Lex(); if (L.is(AsmToken::Integer)) { TypeName = getTok().getString(); Lex(); } else if (getParser().parseIdentifier(TypeName)) return TokError("expected identifier in directive"); return false; } bool ELFAsmParser::parseMergeSize(int64_t &Size) { if (getLexer().isNot(AsmToken::Comma)) return TokError("expected the entry size"); Lex(); if (getParser().parseAbsoluteExpression(Size)) return true; if (Size <= 0) return TokError("entry size must be positive"); return false; } bool ELFAsmParser::parseGroup(StringRef &GroupName) { MCAsmLexer &L = getLexer(); if (L.isNot(AsmToken::Comma)) return TokError("expected group name"); Lex(); - if (getParser().parseIdentifier(GroupName)) + if (L.is(AsmToken::Integer)) { + GroupName = getTok().getString(); + Lex(); + } else if (getParser().parseIdentifier(GroupName)) { return true; + } if (L.is(AsmToken::Comma)) { Lex(); StringRef Linkage; if (getParser().parseIdentifier(Linkage)) return true; if (Linkage != "comdat") return TokError("Linkage must be 'comdat'"); } return false; } bool ELFAsmParser::parseMetadataSym(MCSymbolELF *&Associated) { MCAsmLexer &L = getLexer(); if (L.isNot(AsmToken::Comma)) return TokError("expected metadata symbol"); Lex(); StringRef Name; if (getParser().parseIdentifier(Name)) return true; Associated = dyn_cast_or_null(getContext().lookupSymbol(Name)); if (!Associated || !Associated->isInSection()) return TokError("symbol is not in a section: " + Name); return false; } bool ELFAsmParser::maybeParseUniqueID(int64_t &UniqueID) { MCAsmLexer &L = getLexer(); if (L.isNot(AsmToken::Comma)) return false; Lex(); StringRef UniqueStr; if (getParser().parseIdentifier(UniqueStr)) return TokError("expected identifier in directive"); if (UniqueStr != "unique") return TokError("expected 'unique'"); if (L.isNot(AsmToken::Comma)) return TokError("expected commma"); Lex(); if (getParser().parseAbsoluteExpression(UniqueID)) return true; if (UniqueID < 0) return TokError("unique id must be positive"); if (!isUInt<32>(UniqueID) || UniqueID == ~0U) return TokError("unique id is too large"); return false; } static bool hasPrefix(StringRef SectionName, StringRef Prefix) { return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back(); } bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) { StringRef SectionName; if (ParseSectionName(SectionName)) return TokError("expected identifier in directive"); StringRef TypeName; int64_t Size = 0; StringRef GroupName; unsigned Flags = 0; const MCExpr *Subsection = nullptr; bool UseLastGroup = false; MCSymbolELF *Associated = nullptr; int64_t UniqueID = ~0; // Set the defaults first. if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1") Flags |= ELF::SHF_ALLOC; if (SectionName == ".fini" || SectionName == ".init" || hasPrefix(SectionName, ".text.")) Flags |= ELF::SHF_ALLOC | ELF::SHF_EXECINSTR; if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" || hasPrefix(SectionName, ".bss.") || hasPrefix(SectionName, ".init_array.") || hasPrefix(SectionName, ".fini_array.") || hasPrefix(SectionName, ".preinit_array.")) Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE; if (hasPrefix(SectionName, ".tdata.") || hasPrefix(SectionName, ".tbss.")) Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS; if (getLexer().is(AsmToken::Comma)) { Lex(); if (IsPush && getLexer().isNot(AsmToken::String)) { if (getParser().parseExpression(Subsection)) return true; if (getLexer().isNot(AsmToken::Comma)) goto EndStmt; Lex(); } unsigned extraFlags; if (getLexer().isNot(AsmToken::String)) { if (!getContext().getAsmInfo()->usesSunStyleELFSectionSwitchSyntax() || getLexer().isNot(AsmToken::Hash)) return TokError("expected string in directive"); extraFlags = parseSunStyleSectionFlags(); } else { StringRef FlagsStr = getTok().getStringContents(); Lex(); extraFlags = parseSectionFlags(FlagsStr, &UseLastGroup); } if (extraFlags == -1U) return TokError("unknown flag"); Flags |= extraFlags; bool Mergeable = Flags & ELF::SHF_MERGE; bool Group = Flags & ELF::SHF_GROUP; if (Group && UseLastGroup) return TokError("Section cannot specifiy a group name while also acting " "as a member of the last group"); if (maybeParseSectionType(TypeName)) return true; MCAsmLexer &L = getLexer(); if (TypeName.empty()) { if (Mergeable) return TokError("Mergeable section must specify the type"); if (Group) return TokError("Group section must specify the type"); if (L.isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); } if (Mergeable) if (parseMergeSize(Size)) return true; if (Group) if (parseGroup(GroupName)) return true; if (Flags & ELF::SHF_LINK_ORDER) if (parseMetadataSym(Associated)) return true; if (maybeParseUniqueID(UniqueID)) return true; } EndStmt: if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); Lex(); unsigned Type = ELF::SHT_PROGBITS; if (TypeName.empty()) { if (SectionName.startswith(".note")) Type = ELF::SHT_NOTE; else if (hasPrefix(SectionName, ".init_array.")) Type = ELF::SHT_INIT_ARRAY; else if (hasPrefix(SectionName, ".bss.")) Type = ELF::SHT_NOBITS; else if (hasPrefix(SectionName, ".tbss.")) Type = ELF::SHT_NOBITS; else if (hasPrefix(SectionName, ".fini_array.")) Type = ELF::SHT_FINI_ARRAY; else if (hasPrefix(SectionName, ".preinit_array.")) Type = ELF::SHT_PREINIT_ARRAY; } else { if (TypeName == "init_array") Type = ELF::SHT_INIT_ARRAY; else if (TypeName == "fini_array") Type = ELF::SHT_FINI_ARRAY; else if (TypeName == "preinit_array") Type = ELF::SHT_PREINIT_ARRAY; else if (TypeName == "nobits") Type = ELF::SHT_NOBITS; else if (TypeName == "progbits") Type = ELF::SHT_PROGBITS; else if (TypeName == "note") Type = ELF::SHT_NOTE; else if (TypeName == "unwind") Type = ELF::SHT_X86_64_UNWIND; else if (TypeName == "llvm_odrtab") Type = ELF::SHT_LLVM_ODRTAB; else if (TypeName.getAsInteger(0, Type)) return TokError("unknown section type"); } if (UseLastGroup) { MCSectionSubPair CurrentSection = getStreamer().getCurrentSection(); if (const MCSectionELF *Section = cast_or_null(CurrentSection.first)) if (const MCSymbol *Group = Section->getGroup()) { GroupName = Group->getName(); Flags |= ELF::SHF_GROUP; } } MCSection *ELFSection = getContext().getELFSection(SectionName, Type, Flags, Size, GroupName, UniqueID, Associated); getStreamer().SwitchSection(ELFSection, Subsection); if (getContext().getGenDwarfForAssembly()) { bool InsertResult = getContext().addGenDwarfSection(ELFSection); if (InsertResult) { if (getContext().getDwarfVersion() <= 2) Warning(loc, "DWARF2 only supports one section per compilation unit"); if (!ELFSection->getBeginSymbol()) { MCSymbol *SectionStartSymbol = getContext().createTempSymbol(); getStreamer().EmitLabel(SectionStartSymbol); ELFSection->setBeginSymbol(SectionStartSymbol); } } } return false; } bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) { MCSectionSubPair PreviousSection = getStreamer().getPreviousSection(); if (PreviousSection.first == nullptr) return TokError(".previous without corresponding .section"); getStreamer().SwitchSection(PreviousSection.first, PreviousSection.second); return false; } static MCSymbolAttr MCAttrForString(StringRef Type) { return StringSwitch(Type) .Cases("STT_FUNC", "function", MCSA_ELF_TypeFunction) .Cases("STT_OBJECT", "object", MCSA_ELF_TypeObject) .Cases("STT_TLS", "tls_object", MCSA_ELF_TypeTLS) .Cases("STT_COMMON", "common", MCSA_ELF_TypeCommon) .Cases("STT_NOTYPE", "notype", MCSA_ELF_TypeNoType) .Cases("STT_GNU_IFUNC", "gnu_indirect_function", MCSA_ELF_TypeIndFunction) .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject) .Default(MCSA_Invalid); } /// ParseDirectiveELFType /// ::= .type identifier , STT_ /// ::= .type identifier , #attribute /// ::= .type identifier , @attribute /// ::= .type identifier , %attribute /// ::= .type identifier , "attribute" bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); // NOTE the comma is optional in all cases. It is only documented as being // optional in the first case, however, GAS will silently treat the comma as // optional in all cases. Furthermore, although the documentation states that // the first form only accepts STT_, in reality, GAS // accepts both the upper case name as well as the lower case aliases. if (getLexer().is(AsmToken::Comma)) Lex(); if (getLexer().isNot(AsmToken::Identifier) && getLexer().isNot(AsmToken::Hash) && getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::String)) { if (!getLexer().getAllowAtInIdentifier()) return TokError("expected STT_, '#', " "'%' or \"\""); else if (getLexer().isNot(AsmToken::At)) return TokError("expected STT_, '#', '@', " "'%' or \"\""); } if (getLexer().isNot(AsmToken::String) && getLexer().isNot(AsmToken::Identifier)) Lex(); SMLoc TypeLoc = getLexer().getLoc(); StringRef Type; if (getParser().parseIdentifier(Type)) return TokError("expected symbol type in directive"); MCSymbolAttr Attr = MCAttrForString(Type); if (Attr == MCSA_Invalid) return Error(TypeLoc, "unsupported attribute in '.type' directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.type' directive"); Lex(); getStreamer().EmitSymbolAttribute(Sym, Attr); return false; } /// ParseDirectiveIdent /// ::= .ident string bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::String)) return TokError("unexpected token in '.ident' directive"); StringRef Data = getTok().getIdentifier(); Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.ident' directive"); Lex(); getStreamer().EmitIdent(Data); return false; } /// ParseDirectiveSymver /// ::= .symver foo, bar2@zed bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) { StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::Comma)) return TokError("expected a comma"); // ARM assembly uses @ for a comment... // except when parsing the second parameter of the .symver directive. // Force the next symbol to allow @ in the identifier, which is // required for this directive and then reset it to its initial state. const bool AllowAtInIdentifier = getLexer().getAllowAtInIdentifier(); getLexer().setAllowAtInIdentifier(true); Lex(); getLexer().setAllowAtInIdentifier(AllowAtInIdentifier); StringRef AliasName; if (getParser().parseIdentifier(AliasName)) return TokError("expected identifier in directive"); if (AliasName.find('@') == StringRef::npos) return TokError("expected a '@' in the name"); MCSymbol *Alias = getContext().getOrCreateSymbol(AliasName); MCSymbol *Sym = getContext().getOrCreateSymbol(Name); const MCExpr *Value = MCSymbolRefExpr::create(Sym, getContext()); getStreamer().EmitAssignment(Alias, Value); getStreamer().emitELFSymverDirective(Alias, Sym); return false; } /// ParseDirectiveVersion /// ::= .version string bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::String)) return TokError("unexpected token in '.version' directive"); StringRef Data = getTok().getIdentifier(); Lex(); MCSection *Note = getContext().getELFSection(".note", ELF::SHT_NOTE, 0); getStreamer().PushSection(); getStreamer().SwitchSection(Note); getStreamer().EmitIntValue(Data.size()+1, 4); // namesz. getStreamer().EmitIntValue(0, 4); // descsz = 0 (no description). getStreamer().EmitIntValue(1, 4); // type = NT_VERSION. getStreamer().EmitBytes(Data); // name. getStreamer().EmitIntValue(0, 1); // terminate the string. getStreamer().EmitValueToAlignment(4); // ensure 4 byte alignment. getStreamer().PopSection(); return false; } /// ParseDirectiveWeakref /// ::= .weakref foo, bar bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) { // FIXME: Share code with the other alias building directives. StringRef AliasName; if (getParser().parseIdentifier(AliasName)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::Comma)) return TokError("expected a comma"); Lex(); StringRef Name; if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbol *Alias = getContext().getOrCreateSymbol(AliasName); MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().EmitWeakReference(Alias, Sym); return false; } bool ELFAsmParser::ParseDirectiveSubsection(StringRef, SMLoc) { const MCExpr *Subsection = nullptr; if (getLexer().isNot(AsmToken::EndOfStatement)) { if (getParser().parseExpression(Subsection)) return true; } if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); Lex(); getStreamer().SubSection(Subsection); return false; } namespace llvm { MCAsmParserExtension *createELFAsmParser() { return new ELFAsmParser; } } // end namespace llvm Index: vendor/llvm/dist/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp =================================================================== --- vendor/llvm/dist/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp (revision 327151) +++ vendor/llvm/dist/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp (revision 327152) @@ -1,2116 +1,2124 @@ //===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "Hexagon.h" #include "HexagonISelDAGToDAG.h" #include "HexagonISelLowering.h" #include "HexagonTargetMachine.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include #include #include #include #include #define DEBUG_TYPE "hexagon-isel" using namespace llvm; +namespace { + // -------------------------------------------------------------------- // Implementation of permutation networks. // Implementation of the node routing through butterfly networks: // - Forward delta. // - Reverse delta. // - Benes. // // // Forward delta network consists of log(N) steps, where N is the number // of inputs. In each step, an input can stay in place, or it can get // routed to another position[1]. The step after that consists of two // networks, each half in size in terms of the number of nodes. In those // terms, in the given step, an input can go to either the upper or the // lower network in the next step. // // [1] Hexagon's vdelta/vrdelta allow an element to be routed to both // positions as long as there is no conflict. // Here's a delta network for 8 inputs, only the switching routes are // shown: // // Steps: // |- 1 ---------------|- 2 -----|- 3 -| // // Inp[0] *** *** *** *** Out[0] // \ / \ / \ / // \ / \ / X // \ / \ / / \ // Inp[1] *** \ / *** X *** *** Out[1] // \ \ / / \ / \ / // \ \ / / X X // \ \ / / / \ / \ // Inp[2] *** \ \ / / *** X *** *** Out[2] // \ \ X / / / \ \ / // \ \ / \ / / / \ X // \ X X / / \ / \ // Inp[3] *** \ / \ / \ / *** *** *** Out[3] // \ X X X / // \ / \ / \ / \ / // X X X X // / \ / \ / \ / \ // / X X X \ // Inp[4] *** / \ / \ / \ *** *** *** Out[4] // / X X \ \ / \ / // / / \ / \ \ \ / X // / / X \ \ \ / / \ // Inp[5] *** / / \ \ *** X *** *** Out[5] // / / \ \ \ / \ / // / / \ \ X X // / / \ \ / \ / \ // Inp[6] *** / \ *** X *** *** Out[6] // / \ / \ \ / // / \ / \ X // / \ / \ / \ // Inp[7] *** *** *** *** Out[7] // // // Reverse delta network is same as delta network, with the steps in // the opposite order. // // // Benes network is a forward delta network immediately followed by // a reverse delta network. // Graph coloring utility used to partition nodes into two groups: // they will correspond to nodes routed to the upper and lower networks. struct Coloring { enum : uint8_t { None = 0, Red, Black }; using Node = int; using MapType = std::map; static constexpr Node Ignore = Node(-1); Coloring(ArrayRef Ord) : Order(Ord) { build(); if (!color()) Colors.clear(); } const MapType &colors() const { return Colors; } uint8_t other(uint8_t Color) { if (Color == None) return Red; return Color == Red ? Black : Red; } void dump() const; private: ArrayRef Order; MapType Colors; std::set Needed; using NodeSet = std::set; std::map Edges; Node conj(Node Pos) { Node Num = Order.size(); return (Pos < Num/2) ? Pos + Num/2 : Pos - Num/2; } uint8_t getColor(Node N) { auto F = Colors.find(N); return F != Colors.end() ? F->second : (uint8_t)None; } std::pair getUniqueColor(const NodeSet &Nodes); void build(); bool color(); }; +} // namespace std::pair Coloring::getUniqueColor(const NodeSet &Nodes) { uint8_t Color = None; for (Node N : Nodes) { uint8_t ColorN = getColor(N); if (ColorN == None) continue; if (Color == None) Color = ColorN; else if (Color != None && Color != ColorN) return { false, None }; } return { true, Color }; } void Coloring::build() { // Add Order[P] and Order[conj(P)] to Edges. for (unsigned P = 0; P != Order.size(); ++P) { Node I = Order[P]; if (I != Ignore) { Needed.insert(I); Node PC = Order[conj(P)]; if (PC != Ignore && PC != I) Edges[I].insert(PC); } } // Add I and conj(I) to Edges. for (unsigned I = 0; I != Order.size(); ++I) { if (!Needed.count(I)) continue; Node C = conj(I); // This will create an entry in the edge table, even if I is not // connected to any other node. This is necessary, because it still // needs to be colored. NodeSet &Is = Edges[I]; if (Needed.count(C)) Is.insert(C); } } bool Coloring::color() { SetVector FirstQ; auto Enqueue = [this,&FirstQ] (Node N) { SetVector Q; Q.insert(N); for (unsigned I = 0; I != Q.size(); ++I) { NodeSet &Ns = Edges[Q[I]]; Q.insert(Ns.begin(), Ns.end()); } FirstQ.insert(Q.begin(), Q.end()); }; for (Node N : Needed) Enqueue(N); for (Node N : FirstQ) { if (Colors.count(N)) continue; NodeSet &Ns = Edges[N]; auto P = getUniqueColor(Ns); if (!P.first) return false; Colors[N] = other(P.second); } // First, color nodes that don't have any dups. for (auto E : Edges) { Node N = E.first; if (!Needed.count(conj(N)) || Colors.count(N)) continue; auto P = getUniqueColor(E.second); if (!P.first) return false; Colors[N] = other(P.second); } // Now, nodes that are still uncolored. Since the graph can be modified // in this step, create a work queue. std::vector WorkQ; for (auto E : Edges) { Node N = E.first; if (!Colors.count(N)) WorkQ.push_back(N); } for (unsigned I = 0; I < WorkQ.size(); ++I) { Node N = WorkQ[I]; NodeSet &Ns = Edges[N]; auto P = getUniqueColor(Ns); if (P.first) { Colors[N] = other(P.second); continue; } // Coloring failed. Split this node. Node C = conj(N); uint8_t ColorN = other(None); uint8_t ColorC = other(ColorN); NodeSet &Cs = Edges[C]; NodeSet CopyNs = Ns; for (Node M : CopyNs) { uint8_t ColorM = getColor(M); if (ColorM == ColorC) { // Connect M with C, disconnect M from N. Cs.insert(M); Edges[M].insert(C); Ns.erase(M); Edges[M].erase(N); } } Colors[N] = ColorN; Colors[C] = ColorC; } // Explicitly assign "None" all all uncolored nodes. for (unsigned I = 0; I != Order.size(); ++I) if (Colors.count(I) == 0) Colors[I] = None; return true; } LLVM_DUMP_METHOD void Coloring::dump() const { dbgs() << "{ Order: {"; for (unsigned I = 0; I != Order.size(); ++I) { Node P = Order[I]; if (P != Ignore) dbgs() << ' ' << P; else dbgs() << " -"; } dbgs() << " }\n"; dbgs() << " Needed: {"; for (Node N : Needed) dbgs() << ' ' << N; dbgs() << " }\n"; dbgs() << " Edges: {\n"; for (auto E : Edges) { dbgs() << " " << E.first << " -> {"; for (auto N : E.second) dbgs() << ' ' << N; dbgs() << " }\n"; } dbgs() << " }\n"; static const char *const Names[] = { "None", "Red", "Black" }; dbgs() << " Colors: {\n"; for (auto C : Colors) dbgs() << " " << C.first << " -> " << Names[C.second] << "\n"; dbgs() << " }\n}\n"; } +namespace { // Base class of for reordering networks. They don't strictly need to be // permutations, as outputs with repeated occurrences of an input element // are allowed. struct PermNetwork { using Controls = std::vector; using ElemType = int; static constexpr ElemType Ignore = ElemType(-1); enum : uint8_t { None, Pass, Switch }; enum : uint8_t { Forward, Reverse }; PermNetwork(ArrayRef Ord, unsigned Mult = 1) { Order.assign(Ord.data(), Ord.data()+Ord.size()); Log = 0; unsigned S = Order.size(); while (S >>= 1) ++Log; Table.resize(Order.size()); for (RowType &Row : Table) Row.resize(Mult*Log, None); } void getControls(Controls &V, unsigned StartAt, uint8_t Dir) const { unsigned Size = Order.size(); V.resize(Size); for (unsigned I = 0; I != Size; ++I) { unsigned W = 0; for (unsigned L = 0; L != Log; ++L) { unsigned C = ctl(I, StartAt+L) == Switch; if (Dir == Forward) W |= C << (Log-1-L); else W |= C << L; } assert(isUInt<8>(W)); V[I] = uint8_t(W); } } uint8_t ctl(ElemType Pos, unsigned Step) const { return Table[Pos][Step]; } unsigned size() const { return Order.size(); } unsigned steps() const { return Log; } protected: unsigned Log; std::vector Order; using RowType = std::vector; std::vector Table; }; struct ForwardDeltaNetwork : public PermNetwork { ForwardDeltaNetwork(ArrayRef Ord) : PermNetwork(Ord) {} bool run(Controls &V) { if (!route(Order.data(), Table.data(), size(), 0)) return false; getControls(V, 0, Forward); return true; } private: bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step); }; struct ReverseDeltaNetwork : public PermNetwork { ReverseDeltaNetwork(ArrayRef Ord) : PermNetwork(Ord) {} bool run(Controls &V) { if (!route(Order.data(), Table.data(), size(), 0)) return false; getControls(V, 0, Reverse); return true; } private: bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step); }; struct BenesNetwork : public PermNetwork { BenesNetwork(ArrayRef Ord) : PermNetwork(Ord, 2) {} bool run(Controls &F, Controls &R) { if (!route(Order.data(), Table.data(), size(), 0)) return false; getControls(F, 0, Forward); getControls(R, Log, Reverse); return true; } private: bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step); }; +} // namespace - bool ForwardDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size, unsigned Step) { bool UseUp = false, UseDown = false; ElemType Num = Size; // Cannot use coloring here, because coloring is used to determine // the "big" switch, i.e. the one that changes halves, and in a forward // network, a color can be simultaneously routed to both halves in the // step we're working on. for (ElemType J = 0; J != Num; ++J) { ElemType I = P[J]; // I is the position in the input, // J is the position in the output. if (I == Ignore) continue; uint8_t S; if (I < Num/2) S = (J < Num/2) ? Pass : Switch; else S = (J < Num/2) ? Switch : Pass; // U is the element in the table that needs to be updated. ElemType U = (S == Pass) ? I : (I < Num/2 ? I+Num/2 : I-Num/2); if (U < Num/2) UseUp = true; else UseDown = true; if (T[U][Step] != S && T[U][Step] != None) return false; T[U][Step] = S; } for (ElemType J = 0; J != Num; ++J) if (P[J] != Ignore && P[J] >= Num/2) P[J] -= Num/2; if (Step+1 < Log) { if (UseUp && !route(P, T, Size/2, Step+1)) return false; if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1)) return false; } return true; } bool ReverseDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size, unsigned Step) { unsigned Pets = Log-1 - Step; bool UseUp = false, UseDown = false; ElemType Num = Size; // In this step half-switching occurs, so coloring can be used. Coloring G({P,Size}); const Coloring::MapType &M = G.colors(); if (M.empty()) return false; uint8_t ColorUp = Coloring::None; for (ElemType J = 0; J != Num; ++J) { ElemType I = P[J]; // I is the position in the input, // J is the position in the output. if (I == Ignore) continue; uint8_t C = M.at(I); if (C == Coloring::None) continue; // During "Step", inputs cannot switch halves, so if the "up" color // is still unknown, make sure that it is selected in such a way that // "I" will stay in the same half. bool InpUp = I < Num/2; if (ColorUp == Coloring::None) ColorUp = InpUp ? C : G.other(C); if ((C == ColorUp) != InpUp) { // If I should go to a different half than where is it now, give up. return false; } uint8_t S; if (InpUp) { S = (J < Num/2) ? Pass : Switch; UseUp = true; } else { S = (J < Num/2) ? Switch : Pass; UseDown = true; } T[J][Pets] = S; } // Reorder the working permutation according to the computed switch table // for the last step (i.e. Pets). for (ElemType J = 0, E = Size / 2; J != E; ++J) { ElemType PJ = P[J]; // Current values of P[J] ElemType PC = P[J+Size/2]; // and P[conj(J)] ElemType QJ = PJ; // New values of P[J] ElemType QC = PC; // and P[conj(J)] if (T[J][Pets] == Switch) QC = PJ; if (T[J+Size/2][Pets] == Switch) QJ = PC; P[J] = QJ; P[J+Size/2] = QC; } for (ElemType J = 0; J != Num; ++J) if (P[J] != Ignore && P[J] >= Num/2) P[J] -= Num/2; if (Step+1 < Log) { if (UseUp && !route(P, T, Size/2, Step+1)) return false; if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1)) return false; } return true; } bool BenesNetwork::route(ElemType *P, RowType *T, unsigned Size, unsigned Step) { Coloring G({P,Size}); const Coloring::MapType &M = G.colors(); if (M.empty()) return false; ElemType Num = Size; unsigned Pets = 2*Log-1 - Step; bool UseUp = false, UseDown = false; // Both assignments, i.e. Red->Up and Red->Down are valid, but they will // result in different controls. Let's pick the one where the first // control will be "Pass". uint8_t ColorUp = Coloring::None; for (ElemType J = 0; J != Num; ++J) { ElemType I = P[J]; if (I == Ignore) continue; uint8_t C = M.at(I); if (C == Coloring::None) continue; if (ColorUp == Coloring::None) { ColorUp = (I < Num/2) ? Coloring::Red : Coloring::Black; } unsigned CI = (I < Num/2) ? I+Num/2 : I-Num/2; if (C == ColorUp) { if (I < Num/2) T[I][Step] = Pass; else T[CI][Step] = Switch; T[J][Pets] = (J < Num/2) ? Pass : Switch; UseUp = true; } else { // Down if (I < Num/2) T[CI][Step] = Switch; else T[I][Step] = Pass; T[J][Pets] = (J < Num/2) ? Switch : Pass; UseDown = true; } } // Reorder the working permutation according to the computed switch table // for the last step (i.e. Pets). for (ElemType J = 0; J != Num/2; ++J) { ElemType PJ = P[J]; // Current values of P[J] ElemType PC = P[J+Num/2]; // and P[conj(J)] ElemType QJ = PJ; // New values of P[J] ElemType QC = PC; // and P[conj(J)] if (T[J][Pets] == Switch) QC = PJ; if (T[J+Num/2][Pets] == Switch) QJ = PC; P[J] = QJ; P[J+Num/2] = QC; } for (ElemType J = 0; J != Num; ++J) if (P[J] != Ignore && P[J] >= Num/2) P[J] -= Num/2; if (Step+1 < Log) { if (UseUp && !route(P, T, Size/2, Step+1)) return false; if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1)) return false; } return true; } // -------------------------------------------------------------------- // Support for building selection results (output instructions that are // parts of the final selection). +namespace { struct OpRef { OpRef(SDValue V) : OpV(V) {} bool isValue() const { return OpV.getNode() != nullptr; } bool isValid() const { return isValue() || !(OpN & Invalid); } static OpRef res(int N) { return OpRef(Whole | (N & Index)); } static OpRef fail() { return OpRef(Invalid); } static OpRef lo(const OpRef &R) { assert(!R.isValue()); return OpRef(R.OpN & (Undef | Index | LoHalf)); } static OpRef hi(const OpRef &R) { assert(!R.isValue()); return OpRef(R.OpN & (Undef | Index | HiHalf)); } static OpRef undef(MVT Ty) { return OpRef(Undef | Ty.SimpleTy); } // Direct value. SDValue OpV = SDValue(); // Reference to the operand of the input node: // If the 31st bit is 1, it's undef, otherwise, bits 28..0 are the // operand index: // If bit 30 is set, it's the high half of the operand. // If bit 29 is set, it's the low half of the operand. unsigned OpN = 0; enum : unsigned { Invalid = 0x10000000, LoHalf = 0x20000000, HiHalf = 0x40000000, Whole = LoHalf | HiHalf, Undef = 0x80000000, Index = 0x0FFFFFFF, // Mask of the index value. IndexBits = 28, }; void print(raw_ostream &OS, const SelectionDAG &G) const; private: OpRef(unsigned N) : OpN(N) {} }; struct NodeTemplate { NodeTemplate() = default; unsigned Opc = 0; MVT Ty = MVT::Other; std::vector Ops; void print(raw_ostream &OS, const SelectionDAG &G) const; }; struct ResultStack { ResultStack(SDNode *Inp) : InpNode(Inp), InpTy(Inp->getValueType(0).getSimpleVT()) {} SDNode *InpNode; MVT InpTy; unsigned push(const NodeTemplate &Res) { List.push_back(Res); return List.size()-1; } unsigned push(unsigned Opc, MVT Ty, std::vector &&Ops) { NodeTemplate Res; Res.Opc = Opc; Res.Ty = Ty; Res.Ops = Ops; return push(Res); } bool empty() const { return List.empty(); } unsigned size() const { return List.size(); } unsigned top() const { return size()-1; } const NodeTemplate &operator[](unsigned I) const { return List[I]; } unsigned reset(unsigned NewTop) { List.resize(NewTop+1); return NewTop; } using BaseType = std::vector; BaseType::iterator begin() { return List.begin(); } BaseType::iterator end() { return List.end(); } BaseType::const_iterator begin() const { return List.begin(); } BaseType::const_iterator end() const { return List.end(); } BaseType List; void print(raw_ostream &OS, const SelectionDAG &G) const; }; +} // namespace void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const { if (isValue()) { OpV.getNode()->print(OS, &G); return; } if (OpN & Invalid) { OS << "invalid"; return; } if (OpN & Undef) { OS << "undef"; return; } if ((OpN & Whole) != Whole) { assert((OpN & Whole) == LoHalf || (OpN & Whole) == HiHalf); if (OpN & LoHalf) OS << "lo "; else OS << "hi "; } OS << '#' << SignExtend32(OpN & Index, IndexBits); } void NodeTemplate::print(raw_ostream &OS, const SelectionDAG &G) const { const TargetInstrInfo &TII = *G.getSubtarget().getInstrInfo(); OS << format("%8s", EVT(Ty).getEVTString().c_str()) << " " << TII.getName(Opc); bool Comma = false; for (const auto &R : Ops) { if (Comma) OS << ','; Comma = true; OS << ' '; R.print(OS, G); } } void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const { OS << "Input node:\n"; #ifndef NDEBUG InpNode->dumpr(&G); #endif OS << "Result templates:\n"; for (unsigned I = 0, E = List.size(); I != E; ++I) { OS << '[' << I << "] "; List[I].print(OS, G); OS << '\n'; } } +namespace { struct ShuffleMask { ShuffleMask(ArrayRef M) : Mask(M) { for (unsigned I = 0, E = Mask.size(); I != E; ++I) { int M = Mask[I]; if (M == -1) continue; MinSrc = (MinSrc == -1) ? M : std::min(MinSrc, M); MaxSrc = (MaxSrc == -1) ? M : std::max(MaxSrc, M); } } ArrayRef Mask; int MinSrc = -1, MaxSrc = -1; ShuffleMask lo() const { size_t H = Mask.size()/2; return ShuffleMask(Mask.take_front(H)); } ShuffleMask hi() const { size_t H = Mask.size()/2; return ShuffleMask(Mask.take_back(H)); } }; +} // namespace // -------------------------------------------------------------------- // The HvxSelector class. static const HexagonTargetLowering &getHexagonLowering(SelectionDAG &G) { return static_cast(G.getTargetLoweringInfo()); } static const HexagonSubtarget &getHexagonSubtarget(SelectionDAG &G) { return static_cast(G.getSubtarget()); } namespace llvm { struct HvxSelector { const HexagonTargetLowering &Lower; HexagonDAGToDAGISel &ISel; SelectionDAG &DAG; const HexagonSubtarget &HST; const unsigned HwLen; HvxSelector(HexagonDAGToDAGISel &HS, SelectionDAG &G) : Lower(getHexagonLowering(G)), ISel(HS), DAG(G), HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {} MVT getSingleVT(MVT ElemTy) const { unsigned NumElems = HwLen / (ElemTy.getSizeInBits()/8); return MVT::getVectorVT(ElemTy, NumElems); } MVT getPairVT(MVT ElemTy) const { unsigned NumElems = (2*HwLen) / (ElemTy.getSizeInBits()/8); return MVT::getVectorVT(ElemTy, NumElems); } void selectShuffle(SDNode *N); void selectRor(SDNode *N); private: void materialize(const ResultStack &Results); SDValue getVectorConstant(ArrayRef Data, const SDLoc &dl); enum : unsigned { None, PackMux, }; OpRef concat(OpRef Va, OpRef Vb, ResultStack &Results); OpRef packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results, MutableArrayRef NewMask, unsigned Options = None); OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results, MutableArrayRef NewMask); OpRef zerous(ShuffleMask SM, OpRef Va, ResultStack &Results); OpRef vmuxs(ArrayRef Bytes, OpRef Va, OpRef Vb, ResultStack &Results); OpRef vmuxp(ArrayRef Bytes, OpRef Va, OpRef Vb, ResultStack &Results); OpRef shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results); OpRef shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results); OpRef shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results); OpRef shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results); OpRef butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results); OpRef contracting(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results); OpRef expanding(ShuffleMask SM, OpRef Va, ResultStack &Results); OpRef perfect(ShuffleMask SM, OpRef Va, ResultStack &Results); bool selectVectorConstants(SDNode *N); bool scalarizeShuffle(ArrayRef Mask, const SDLoc &dl, MVT ResTy, SDValue Va, SDValue Vb, SDNode *N); }; } static void splitMask(ArrayRef Mask, MutableArrayRef MaskL, MutableArrayRef MaskR) { unsigned VecLen = Mask.size(); assert(MaskL.size() == VecLen && MaskR.size() == VecLen); for (unsigned I = 0; I != VecLen; ++I) { int M = Mask[I]; if (M < 0) { MaskL[I] = MaskR[I] = -1; } else if (unsigned(M) < VecLen) { MaskL[I] = M; MaskR[I] = -1; } else { MaskL[I] = -1; MaskR[I] = M-VecLen; } } } static std::pair findStrip(ArrayRef A, int Inc, unsigned MaxLen) { assert(A.size() > 0 && A.size() >= MaxLen); int F = A[0]; int E = F; for (unsigned I = 1; I != MaxLen; ++I) { if (A[I] - E != Inc) return { F, I }; E = A[I]; } return { F, MaxLen }; } static bool isUndef(ArrayRef Mask) { for (int Idx : Mask) if (Idx != -1) return false; return true; } static bool isIdentity(ArrayRef Mask) { for (int I = 0, E = Mask.size(); I != E; ++I) { int M = Mask[I]; if (M >= 0 && M != I) return false; } return true; } static bool isPermutation(ArrayRef Mask) { // Check by adding all numbers only works if there is no overflow. assert(Mask.size() < 0x00007FFF && "Sanity failure"); int Sum = 0; for (int Idx : Mask) { if (Idx == -1) return false; Sum += Idx; } int N = Mask.size(); return 2*Sum == N*(N-1); } bool HvxSelector::selectVectorConstants(SDNode *N) { // Constant vectors are generated as loads from constant pools. // Since they are generated during the selection process, the main // selection algorithm is not aware of them. Select them directly // here. SmallVector Loads; SmallVector WorkQ; // The DAG can change (due to CSE) during selection, so cache all the // unselected nodes first to avoid traversing a mutating DAG. auto IsLoadToSelect = [] (SDNode *N) { if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) { SDValue Addr = cast(N)->getBasePtr(); unsigned AddrOpc = Addr.getOpcode(); if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP) if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool) return true; } return false; }; WorkQ.push_back(N); for (unsigned i = 0; i != WorkQ.size(); ++i) { SDNode *W = WorkQ[i]; if (IsLoadToSelect(W)) { Loads.push_back(W); continue; } for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j) WorkQ.push_back(W->getOperand(j).getNode()); } for (SDNode *L : Loads) ISel.Select(L); return !Loads.empty(); } void HvxSelector::materialize(const ResultStack &Results) { DEBUG_WITH_TYPE("isel", { dbgs() << "Materializing\n"; Results.print(dbgs(), DAG); }); if (Results.empty()) return; const SDLoc &dl(Results.InpNode); std::vector Output; for (unsigned I = 0, E = Results.size(); I != E; ++I) { const NodeTemplate &Node = Results[I]; std::vector Ops; for (const OpRef &R : Node.Ops) { assert(R.isValid()); if (R.isValue()) { Ops.push_back(R.OpV); continue; } if (R.OpN & OpRef::Undef) { MVT::SimpleValueType SVT = MVT::SimpleValueType(R.OpN & OpRef::Index); Ops.push_back(ISel.selectUndef(dl, MVT(SVT))); continue; } // R is an index of a result. unsigned Part = R.OpN & OpRef::Whole; int Idx = SignExtend32(R.OpN & OpRef::Index, OpRef::IndexBits); if (Idx < 0) Idx += I; assert(Idx >= 0 && unsigned(Idx) < Output.size()); SDValue Op = Output[Idx]; MVT OpTy = Op.getValueType().getSimpleVT(); if (Part != OpRef::Whole) { assert(Part == OpRef::LoHalf || Part == OpRef::HiHalf); if (Op.getOpcode() == HexagonISD::VCOMBINE) { Op = (Part == OpRef::HiHalf) ? Op.getOperand(0) : Op.getOperand(1); } else { MVT HalfTy = MVT::getVectorVT(OpTy.getVectorElementType(), OpTy.getVectorNumElements()/2); unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo : Hexagon::vsub_hi; Op = DAG.getTargetExtractSubreg(Sub, dl, HalfTy, Op); } } Ops.push_back(Op); } // for (Node : Results) assert(Node.Ty != MVT::Other); SDNode *ResN = (Node.Opc == TargetOpcode::COPY) ? Ops.front().getNode() : DAG.getMachineNode(Node.Opc, dl, Node.Ty, Ops); Output.push_back(SDValue(ResN, 0)); } SDNode *OutN = Output.back().getNode(); SDNode *InpN = Results.InpNode; DEBUG_WITH_TYPE("isel", { dbgs() << "Generated node:\n"; OutN->dumpr(&DAG); }); ISel.ReplaceNode(InpN, OutN); selectVectorConstants(OutN); DAG.RemoveDeadNodes(); } OpRef HvxSelector::concat(OpRef Lo, OpRef Hi, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); const SDLoc &dl(Results.InpNode); Results.push(TargetOpcode::REG_SEQUENCE, getPairVT(MVT::i8), { DAG.getTargetConstant(Hexagon::HvxWRRegClassID, dl, MVT::i32), Lo, DAG.getTargetConstant(Hexagon::vsub_lo, dl, MVT::i32), Hi, DAG.getTargetConstant(Hexagon::vsub_hi, dl, MVT::i32), }); return OpRef::res(Results.top()); } // Va, Vb are single vectors, SM can be arbitrarily long. OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results, MutableArrayRef NewMask, unsigned Options) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); if (!Va.isValid() || !Vb.isValid()) return OpRef::fail(); int VecLen = SM.Mask.size(); MVT Ty = getSingleVT(MVT::i8); if (SM.MaxSrc - SM.MinSrc < int(HwLen)) { if (SM.MaxSrc < int(HwLen)) { memcpy(NewMask.data(), SM.Mask.data(), sizeof(int)*VecLen); return Va; } if (SM.MinSrc >= int(HwLen)) { for (int I = 0; I != VecLen; ++I) { int M = SM.Mask[I]; if (M != -1) M -= HwLen; NewMask[I] = M; } return Vb; } const SDLoc &dl(Results.InpNode); SDValue S = DAG.getTargetConstant(SM.MinSrc, dl, MVT::i32); if (isUInt<3>(SM.MinSrc)) { Results.push(Hexagon::V6_valignbi, Ty, {Vb, Va, S}); } else { Results.push(Hexagon::A2_tfrsi, MVT::i32, {S}); unsigned Top = Results.top(); Results.push(Hexagon::V6_valignb, Ty, {Vb, Va, OpRef::res(Top)}); } for (int I = 0; I != VecLen; ++I) { int M = SM.Mask[I]; if (M != -1) M -= SM.MinSrc; NewMask[I] = M; } return OpRef::res(Results.top()); } if (Options & PackMux) { // If elements picked from Va and Vb have all different (source) indexes // (relative to the start of the argument), do a mux, and update the mask. BitVector Picked(HwLen); SmallVector MuxBytes(HwLen); bool CanMux = true; for (int I = 0; I != VecLen; ++I) { int M = SM.Mask[I]; if (M == -1) continue; if (M >= int(HwLen)) M -= HwLen; else MuxBytes[M] = 0xFF; if (Picked[M]) { CanMux = false; break; } NewMask[I] = M; } if (CanMux) return vmuxs(MuxBytes, Va, Vb, Results); } return OpRef::fail(); } OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results, MutableArrayRef NewMask) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); unsigned HalfMask = 0; unsigned LogHw = Log2_32(HwLen); for (int M : SM.Mask) { if (M == -1) continue; HalfMask |= (1u << (M >> LogHw)); } if (HalfMask == 0) return OpRef::undef(getPairVT(MVT::i8)); // If more than two halves are used, bail. // TODO: be more aggressive here? if (countPopulation(HalfMask) > 2) return OpRef::fail(); MVT HalfTy = getSingleVT(MVT::i8); OpRef Inp[2] = { Va, Vb }; OpRef Out[2] = { OpRef::undef(HalfTy), OpRef::undef(HalfTy) }; uint8_t HalfIdx[4] = { 0xFF, 0xFF, 0xFF, 0xFF }; unsigned Idx = 0; for (unsigned I = 0; I != 4; ++I) { if ((HalfMask & (1u << I)) == 0) continue; assert(Idx < 2); OpRef Op = Inp[I/2]; Out[Idx] = (I & 1) ? OpRef::hi(Op) : OpRef::lo(Op); HalfIdx[I] = Idx++; } int VecLen = SM.Mask.size(); for (int I = 0; I != VecLen; ++I) { int M = SM.Mask[I]; if (M >= 0) { uint8_t Idx = HalfIdx[M >> LogHw]; assert(Idx == 0 || Idx == 1); M = (M & (HwLen-1)) + HwLen*Idx; } NewMask[I] = M; } return concat(Out[0], Out[1], Results); } OpRef HvxSelector::zerous(ShuffleMask SM, OpRef Va, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); int VecLen = SM.Mask.size(); SmallVector UsedBytes(VecLen); bool HasUnused = false; for (int I = 0; I != VecLen; ++I) { if (SM.Mask[I] != -1) UsedBytes[I] = 0xFF; else HasUnused = true; } if (!HasUnused) return Va; SDValue B = getVectorConstant(UsedBytes, SDLoc(Results.InpNode)); Results.push(Hexagon::V6_vand, getSingleVT(MVT::i8), {Va, OpRef(B)}); return OpRef::res(Results.top()); } OpRef HvxSelector::vmuxs(ArrayRef Bytes, OpRef Va, OpRef Vb, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); MVT ByteTy = getSingleVT(MVT::i8); MVT BoolTy = MVT::getVectorVT(MVT::i1, 8*HwLen); // XXX const SDLoc &dl(Results.InpNode); SDValue B = getVectorConstant(Bytes, dl); Results.push(Hexagon::V6_vd0, ByteTy, {}); Results.push(Hexagon::V6_veqb, BoolTy, {OpRef(B), OpRef::res(-1)}); Results.push(Hexagon::V6_vmux, ByteTy, {OpRef::res(-1), Vb, Va}); return OpRef::res(Results.top()); } OpRef HvxSelector::vmuxp(ArrayRef Bytes, OpRef Va, OpRef Vb, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); size_t S = Bytes.size() / 2; OpRef L = vmuxs(Bytes.take_front(S), OpRef::lo(Va), OpRef::lo(Vb), Results); OpRef H = vmuxs(Bytes.drop_front(S), OpRef::hi(Va), OpRef::hi(Vb), Results); return concat(L, H, Results); } OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); unsigned VecLen = SM.Mask.size(); assert(HwLen == VecLen); (void)VecLen; assert(all_of(SM.Mask, [this](int M) { return M == -1 || M < int(HwLen); })); if (isIdentity(SM.Mask)) return Va; if (isUndef(SM.Mask)) return OpRef::undef(getSingleVT(MVT::i8)); OpRef P = perfect(SM, Va, Results); if (P.isValid()) return P; return butterfly(SM, Va, Results); } OpRef HvxSelector::shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); if (isUndef(SM.Mask)) return OpRef::undef(getSingleVT(MVT::i8)); OpRef C = contracting(SM, Va, Vb, Results); if (C.isValid()) return C; int VecLen = SM.Mask.size(); SmallVector NewMask(VecLen); OpRef P = packs(SM, Va, Vb, Results, NewMask); if (P.isValid()) return shuffs1(ShuffleMask(NewMask), P, Results); SmallVector MaskL(VecLen), MaskR(VecLen); splitMask(SM.Mask, MaskL, MaskR); OpRef L = shuffs1(ShuffleMask(MaskL), Va, Results); OpRef R = shuffs1(ShuffleMask(MaskR), Vb, Results); if (!L.isValid() || !R.isValid()) return OpRef::fail(); SmallVector Bytes(VecLen); for (int I = 0; I != VecLen; ++I) { if (MaskL[I] != -1) Bytes[I] = 0xFF; } return vmuxs(Bytes, L, R, Results); } OpRef HvxSelector::shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); int VecLen = SM.Mask.size(); if (isIdentity(SM.Mask)) return Va; if (isUndef(SM.Mask)) return OpRef::undef(getPairVT(MVT::i8)); SmallVector PackedMask(VecLen); OpRef P = packs(SM, OpRef::lo(Va), OpRef::hi(Va), Results, PackedMask); if (P.isValid()) { ShuffleMask PM(PackedMask); OpRef E = expanding(PM, P, Results); if (E.isValid()) return E; OpRef L = shuffs1(PM.lo(), P, Results); OpRef H = shuffs1(PM.hi(), P, Results); if (L.isValid() && H.isValid()) return concat(L, H, Results); } OpRef R = perfect(SM, Va, Results); if (R.isValid()) return R; // TODO commute the mask and try the opposite order of the halves. OpRef L = shuffs2(SM.lo(), OpRef::lo(Va), OpRef::hi(Va), Results); OpRef H = shuffs2(SM.hi(), OpRef::lo(Va), OpRef::hi(Va), Results); if (L.isValid() && H.isValid()) return concat(L, H, Results); return OpRef::fail(); } OpRef HvxSelector::shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); if (isUndef(SM.Mask)) return OpRef::undef(getPairVT(MVT::i8)); int VecLen = SM.Mask.size(); SmallVector PackedMask(VecLen); OpRef P = packp(SM, Va, Vb, Results, PackedMask); if (P.isValid()) return shuffp1(ShuffleMask(PackedMask), P, Results); SmallVector MaskL(VecLen), MaskR(VecLen); OpRef L = shuffp1(ShuffleMask(MaskL), Va, Results); OpRef R = shuffp1(ShuffleMask(MaskR), Vb, Results); if (!L.isValid() || !R.isValid()) return OpRef::fail(); // Mux the results. SmallVector Bytes(VecLen); for (int I = 0; I != VecLen; ++I) { if (MaskL[I] != -1) Bytes[I] = 0xFF; } return vmuxp(Bytes, L, R, Results); } bool HvxSelector::scalarizeShuffle(ArrayRef Mask, const SDLoc &dl, MVT ResTy, SDValue Va, SDValue Vb, SDNode *N) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); MVT ElemTy = ResTy.getVectorElementType(); assert(ElemTy == MVT::i8); unsigned VecLen = Mask.size(); bool HavePairs = (2*HwLen == VecLen); MVT SingleTy = getSingleVT(MVT::i8); SmallVector Ops; for (int I : Mask) { if (I < 0) { Ops.push_back(ISel.selectUndef(dl, ElemTy)); continue; } SDValue Vec; unsigned M = I; if (M < VecLen) { Vec = Va; } else { Vec = Vb; M -= VecLen; } if (HavePairs) { if (M < HwLen) { Vec = DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, SingleTy, Vec); } else { Vec = DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, SingleTy, Vec); M -= HwLen; } } SDValue Idx = DAG.getConstant(M, dl, MVT::i32); SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemTy, {Vec, Idx}); SDValue L = Lower.LowerOperation(Ex, DAG); assert(L.getNode()); Ops.push_back(L); } SDValue LV; if (2*HwLen == VecLen) { SDValue B0 = DAG.getBuildVector(SingleTy, dl, {Ops.data(), HwLen}); SDValue L0 = Lower.LowerOperation(B0, DAG); SDValue B1 = DAG.getBuildVector(SingleTy, dl, {Ops.data()+HwLen, HwLen}); SDValue L1 = Lower.LowerOperation(B1, DAG); // XXX CONCAT_VECTORS is legal for HVX vectors. Legalizing (lowering) // functions may expect to be called only for illegal operations, so // make sure that they are not called for legal ones. Develop a better // mechanism for dealing with this. LV = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResTy, {L0, L1}); } else { SDValue BV = DAG.getBuildVector(ResTy, dl, Ops); LV = Lower.LowerOperation(BV, DAG); } assert(!N->use_empty()); ISel.ReplaceNode(N, LV.getNode()); DAG.RemoveDeadNodes(); std::deque SubNodes; SubNodes.push_back(LV.getNode()); for (unsigned I = 0; I != SubNodes.size(); ++I) { for (SDValue Op : SubNodes[I]->ops()) SubNodes.push_back(Op.getNode()); } while (!SubNodes.empty()) { SDNode *S = SubNodes.front(); SubNodes.pop_front(); if (S->use_empty()) continue; // This isn't great, but users need to be selected before any nodes that // they use. (The reason is to match larger patterns, and avoid nodes that // cannot be matched on their own, e.g. ValueType, TokenFactor, etc.). bool PendingUser = llvm::any_of(S->uses(), [&SubNodes](const SDNode *U) { return llvm::any_of(SubNodes, [U](const SDNode *T) { return T == U; }); }); if (PendingUser) SubNodes.push_back(S); else ISel.Select(S); } DAG.RemoveDeadNodes(); return true; } OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); if (!Va.isValid() || !Vb.isValid()) return OpRef::fail(); // Contracting shuffles, i.e. instructions that always discard some bytes // from the operand vectors. // // V6_vshuff{e,o}b // V6_vdealb4w // V6_vpack{e,o}{b,h} int VecLen = SM.Mask.size(); std::pair Strip = findStrip(SM.Mask, 1, VecLen); MVT ResTy = getSingleVT(MVT::i8); // The following shuffles only work for bytes and halfwords. This requires // the strip length to be 1 or 2. if (Strip.second != 1 && Strip.second != 2) return OpRef::fail(); // The patterns for the shuffles, in terms of the starting offsets of the // consecutive strips (L = length of the strip, N = VecLen): // // vpacke: 0, 2L, 4L ... N+0, N+2L, N+4L ... L = 1 or 2 // vpacko: L, 3L, 5L ... N+L, N+3L, N+5L ... L = 1 or 2 // // vshuffe: 0, N+0, 2L, N+2L, 4L ... L = 1 or 2 // vshuffo: L, N+L, 3L, N+3L, 5L ... L = 1 or 2 // // vdealb4w: 0, 4, 8 ... 2, 6, 10 ... N+0, N+4, N+8 ... N+2, N+6, N+10 ... // The value of the element in the mask following the strip will decide // what kind of a shuffle this can be. int NextInMask = SM.Mask[Strip.second]; // Check if NextInMask could be 2L, 3L or 4, i.e. if it could be a mask // for vpack or vdealb4w. VecLen > 4, so NextInMask for vdealb4w would // satisfy this. if (NextInMask < VecLen) { // vpack{e,o} or vdealb4w if (Strip.first == 0 && Strip.second == 1 && NextInMask == 4) { int N = VecLen; // Check if this is vdealb4w (L=1). for (int I = 0; I != N/4; ++I) if (SM.Mask[I] != 4*I) return OpRef::fail(); for (int I = 0; I != N/4; ++I) if (SM.Mask[I+N/4] != 2 + 4*I) return OpRef::fail(); for (int I = 0; I != N/4; ++I) if (SM.Mask[I+N/2] != N + 4*I) return OpRef::fail(); for (int I = 0; I != N/4; ++I) if (SM.Mask[I+3*N/4] != N+2 + 4*I) return OpRef::fail(); // Matched mask for vdealb4w. Results.push(Hexagon::V6_vdealb4w, ResTy, {Vb, Va}); return OpRef::res(Results.top()); } // Check if this is vpack{e,o}. int N = VecLen; int L = Strip.second; // Check if the first strip starts at 0 or at L. if (Strip.first != 0 && Strip.first != L) return OpRef::fail(); // Examine the rest of the mask. for (int I = L; I < N; I += L) { auto S = findStrip(SM.Mask.drop_front(I), 1, N-I); // Check whether the mask element at the beginning of each strip // increases by 2L each time. if (S.first - Strip.first != 2*I) return OpRef::fail(); // Check whether each strip is of the same length. if (S.second != unsigned(L)) return OpRef::fail(); } // Strip.first == 0 => vpacke // Strip.first == L => vpacko assert(Strip.first == 0 || Strip.first == L); using namespace Hexagon; NodeTemplate Res; Res.Opc = Strip.second == 1 // Number of bytes. ? (Strip.first == 0 ? V6_vpackeb : V6_vpackob) : (Strip.first == 0 ? V6_vpackeh : V6_vpackoh); Res.Ty = ResTy; Res.Ops = { Vb, Va }; Results.push(Res); return OpRef::res(Results.top()); } // Check if this is vshuff{e,o}. int N = VecLen; int L = Strip.second; std::pair PrevS = Strip; bool Flip = false; for (int I = L; I < N; I += L) { auto S = findStrip(SM.Mask.drop_front(I), 1, N-I); if (S.second != PrevS.second) return OpRef::fail(); int Diff = Flip ? PrevS.first - S.first + 2*L : S.first - PrevS.first; if (Diff != N) return OpRef::fail(); Flip ^= true; PrevS = S; } // Strip.first == 0 => vshuffe // Strip.first == L => vshuffo assert(Strip.first == 0 || Strip.first == L); using namespace Hexagon; NodeTemplate Res; Res.Opc = Strip.second == 1 // Number of bytes. ? (Strip.first == 0 ? V6_vshuffeb : V6_vshuffob) : (Strip.first == 0 ? V6_vshufeh : V6_vshufoh); Res.Ty = ResTy; Res.Ops = { Vb, Va }; Results.push(Res); return OpRef::res(Results.top()); } OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); // Expanding shuffles (using all elements and inserting into larger vector): // // V6_vunpacku{b,h} [*] // // [*] Only if the upper elements (filled with 0s) are "don't care" in Mask. // // Note: V6_vunpacko{b,h} are or-ing the high byte/half in the result, so // they are not shuffles. // // The argument is a single vector. int VecLen = SM.Mask.size(); assert(2*HwLen == unsigned(VecLen) && "Expecting vector-pair type"); std::pair Strip = findStrip(SM.Mask, 1, VecLen); // The patterns for the unpacks, in terms of the starting offsets of the // consecutive strips (L = length of the strip, N = VecLen): // // vunpacku: 0, -1, L, -1, 2L, -1 ... if (Strip.first != 0) return OpRef::fail(); // The vunpackus only handle byte and half-word. if (Strip.second != 1 && Strip.second != 2) return OpRef::fail(); int N = VecLen; int L = Strip.second; // First, check the non-ignored strips. for (int I = 2*L; I < 2*N; I += 2*L) { auto S = findStrip(SM.Mask.drop_front(I), 1, N-I); if (S.second != unsigned(L)) return OpRef::fail(); if (2*S.first != I) return OpRef::fail(); } // Check the -1s. for (int I = L; I < 2*N; I += 2*L) { auto S = findStrip(SM.Mask.drop_front(I), 0, N-I); if (S.first != -1 || S.second != unsigned(L)) return OpRef::fail(); } unsigned Opc = Strip.second == 1 ? Hexagon::V6_vunpackub : Hexagon::V6_vunpackuh; Results.push(Opc, getPairVT(MVT::i8), {Va}); return OpRef::res(Results.top()); } OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); // V6_vdeal{b,h} // V6_vshuff{b,h} // V6_vshufoe{b,h} those are quivalent to vshuffvdd(..,{1,2}) // V6_vshuffvdd (V6_vshuff) // V6_dealvdd (V6_vdeal) int VecLen = SM.Mask.size(); assert(isPowerOf2_32(VecLen) && Log2_32(VecLen) <= 8); unsigned LogLen = Log2_32(VecLen); unsigned HwLog = Log2_32(HwLen); // The result length must be the same as the length of a single vector, // or a vector pair. assert(LogLen == HwLog || LogLen == HwLog+1); bool Extend = (LogLen == HwLog); if (!isPermutation(SM.Mask)) return OpRef::fail(); SmallVector Perm(LogLen); // Check if this could be a perfect shuffle, or a combination of perfect // shuffles. // // Consider this permutation (using hex digits to make the ASCII diagrams // easier to read): // { 0, 8, 1, 9, 2, A, 3, B, 4, C, 5, D, 6, E, 7, F }. // This is a "deal" operation: divide the input into two halves, and // create the output by picking elements by alternating between these two // halves: // 0 1 2 3 4 5 6 7 --> 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F [*] // 8 9 A B C D E F // // Aside from a few special explicit cases (V6_vdealb, etc.), HVX provides // a somwehat different mechanism that could be used to perform shuffle/ // deal operations: a 2x2 transpose. // Consider the halves of inputs again, they can be interpreted as a 2x8 // matrix. A 2x8 matrix can be looked at four 2x2 matrices concatenated // together. Now, when considering 2 elements at a time, it will be a 2x4 // matrix (with elements 01, 23, 45, etc.), or two 2x2 matrices: // 01 23 45 67 // 89 AB CD EF // With groups of 4, this will become a single 2x2 matrix, and so on. // // The 2x2 transpose instruction works by transposing each of the 2x2 // matrices (or "sub-matrices"), given a specific group size. For example, // if the group size is 1 (i.e. each element is its own group), there // will be four transposes of the four 2x2 matrices that form the 2x8. // For example, with the inputs as above, the result will be: // 0 8 2 A 4 C 6 E // 1 9 3 B 5 D 7 F // Now, this result can be tranposed again, but with the group size of 2: // 08 19 4C 5D // 2A 3B 6E 7F // If we then transpose that result, but with the group size of 4, we get: // 0819 2A3B // 4C5D 6E7F // If we concatenate these two rows, it will be // 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F // which is the same as the "deal" [*] above. // // In general, a "deal" of individual elements is a series of 2x2 transposes, // with changing group size. HVX has two instructions: // Vdd = V6_vdealvdd Vu, Vv, Rt // Vdd = V6_shufvdd Vu, Vv, Rt // that perform exactly that. The register Rt controls which transposes are // going to happen: a bit at position n (counting from 0) indicates that a // transpose with a group size of 2^n will take place. If multiple bits are // set, multiple transposes will happen: vdealvdd will perform them starting // with the largest group size, vshuffvdd will do them in the reverse order. // // The main observation is that each 2x2 transpose corresponds to swapping // columns of bits in the binary representation of the values. // // The numbers {3,2,1,0} and the log2 of the number of contiguous 1 bits // in a given column. The * denote the columns that will be swapped. // The transpose with the group size 2^n corresponds to swapping columns // 3 (the highest log) and log2(n): // // 3 2 1 0 0 2 1 3 0 2 3 1 // * * * * * * // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 // 1 0 0 0 1 8 1 0 0 0 8 1 0 0 0 8 1 0 0 0 // 2 0 0 1 0 2 0 0 1 0 1 0 0 0 1 1 0 0 0 1 // 3 0 0 1 1 A 1 0 1 0 9 1 0 0 1 9 1 0 0 1 // 4 0 1 0 0 4 0 1 0 0 4 0 1 0 0 2 0 0 1 0 // 5 0 1 0 1 C 1 1 0 0 C 1 1 0 0 A 1 0 1 0 // 6 0 1 1 0 6 0 1 1 0 5 0 1 0 1 3 0 0 1 1 // 7 0 1 1 1 E 1 1 1 0 D 1 1 0 1 B 1 0 1 1 // 8 1 0 0 0 1 0 0 0 1 2 0 0 1 0 4 0 1 0 0 // 9 1 0 0 1 9 1 0 0 1 A 1 0 1 0 C 1 1 0 0 // A 1 0 1 0 3 0 0 1 1 3 0 0 1 1 5 0 1 0 1 // B 1 0 1 1 B 1 0 1 1 B 1 0 1 1 D 1 1 0 1 // C 1 1 0 0 5 0 1 0 1 6 0 1 1 0 6 0 1 1 0 // D 1 1 0 1 D 1 1 0 1 E 1 1 1 0 E 1 1 1 0 // E 1 1 1 0 7 0 1 1 1 7 0 1 1 1 7 0 1 1 1 // F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 auto XorPow2 = [] (ArrayRef Mask, unsigned Num) { unsigned X = Mask[0] ^ Mask[Num/2]; // Check that the first half has the X's bits clear. if ((Mask[0] & X) != 0) return 0u; for (unsigned I = 1; I != Num/2; ++I) { if (unsigned(Mask[I] ^ Mask[I+Num/2]) != X) return 0u; if ((Mask[I] & X) != 0) return 0u; } return X; }; // Create a vector of log2's for each column: Perm[i] corresponds to // the i-th bit (lsb is 0). assert(VecLen > 2); for (unsigned I = VecLen; I >= 2; I >>= 1) { // Examine the initial segment of Mask of size I. unsigned X = XorPow2(SM.Mask, I); if (!isPowerOf2_32(X)) return OpRef::fail(); // Check the other segments of Mask. for (int J = I; J < VecLen; J += I) { if (XorPow2(SM.Mask.slice(J, I), I) != X) return OpRef::fail(); } Perm[Log2_32(X)] = Log2_32(I)-1; } // Once we have Perm, represent it as cycles. Denote the maximum log2 // (equal to log2(VecLen)-1) as M. The cycle containing M can then be // written as (M a1 a2 a3 ... an). That cycle can be broken up into // simple swaps as (M a1)(M a2)(M a3)...(M an), with the composition // order being from left to right. Any (contiguous) segment where the // values ai, ai+1...aj are either all increasing or all decreasing, // can be implemented via a single vshuffvdd/vdealvdd respectively. // // If there is a cycle (a1 a2 ... an) that does not involve M, it can // be written as (M an)(a1 a2 ... an)(M a1). The first two cycles can // then be folded to get (M a1 a2 ... an)(M a1), and the above procedure // can be used to generate a sequence of vshuffvdd/vdealvdd. // // Example: // Assume M = 4 and consider a permutation (0 1)(2 3). It can be written // as (4 0 1)(4 0) composed with (4 2 3)(4 2), or simply // (4 0 1)(4 0)(4 2 3)(4 2). // It can then be expanded into swaps as // (4 0)(4 1)(4 0)(4 2)(4 3)(4 2), // and broken up into "increasing" segments as // [(4 0)(4 1)] [(4 0)(4 2)(4 3)] [(4 2)]. // This is equivalent to // (4 0 1)(4 0 2 3)(4 2), // which can be implemented as 3 vshufvdd instructions. using CycleType = SmallVector; std::set Cycles; std::set All; for (unsigned I : Perm) All.insert(I); // If the cycle contains LogLen-1, move it to the front of the cycle. // Otherwise, return the cycle unchanged. auto canonicalize = [LogLen](const CycleType &C) -> CycleType { unsigned LogPos, N = C.size(); for (LogPos = 0; LogPos != N; ++LogPos) if (C[LogPos] == LogLen-1) break; if (LogPos == N) return C; CycleType NewC(C.begin()+LogPos, C.end()); NewC.append(C.begin(), C.begin()+LogPos); return NewC; }; auto pfs = [](const std::set &Cs, unsigned Len) { // Ordering: shuff: 5 0 1 2 3 4, deal: 5 4 3 2 1 0 (for Log=6), // for bytes zero is included, for halfwords is not. if (Cs.size() != 1) return 0u; const CycleType &C = *Cs.begin(); if (C[0] != Len-1) return 0u; int D = Len - C.size(); if (D != 0 && D != 1) return 0u; bool IsDeal = true, IsShuff = true; for (unsigned I = 1; I != Len-D; ++I) { if (C[I] != Len-1-I) IsDeal = false; if (C[I] != I-(1-D)) // I-1, I IsShuff = false; } // At most one, IsDeal or IsShuff, can be non-zero. assert(!(IsDeal || IsShuff) || IsDeal != IsShuff); static unsigned Deals[] = { Hexagon::V6_vdealb, Hexagon::V6_vdealh }; static unsigned Shufs[] = { Hexagon::V6_vshuffb, Hexagon::V6_vshuffh }; return IsDeal ? Deals[D] : (IsShuff ? Shufs[D] : 0); }; while (!All.empty()) { unsigned A = *All.begin(); All.erase(A); CycleType C; C.push_back(A); for (unsigned B = Perm[A]; B != A; B = Perm[B]) { C.push_back(B); All.erase(B); } if (C.size() <= 1) continue; Cycles.insert(canonicalize(C)); } MVT SingleTy = getSingleVT(MVT::i8); MVT PairTy = getPairVT(MVT::i8); // Recognize patterns for V6_vdeal{b,h} and V6_vshuff{b,h}. if (unsigned(VecLen) == HwLen) { if (unsigned SingleOpc = pfs(Cycles, LogLen)) { Results.push(SingleOpc, SingleTy, {Va}); return OpRef::res(Results.top()); } } SmallVector SwapElems; if (HwLen == unsigned(VecLen)) SwapElems.push_back(LogLen-1); for (const CycleType &C : Cycles) { unsigned First = (C[0] == LogLen-1) ? 1 : 0; SwapElems.append(C.begin()+First, C.end()); if (First == 0) SwapElems.push_back(C[0]); } const SDLoc &dl(Results.InpNode); OpRef Arg = !Extend ? Va : concat(Va, OpRef::undef(SingleTy), Results); for (unsigned I = 0, E = SwapElems.size(); I != E; ) { bool IsInc = I == E-1 || SwapElems[I] < SwapElems[I+1]; unsigned S = (1u << SwapElems[I]); if (I < E-1) { while (++I < E-1 && IsInc == (SwapElems[I] < SwapElems[I+1])) S |= 1u << SwapElems[I]; // The above loop will not add a bit for the final SwapElems[I+1], // so add it here. S |= 1u << SwapElems[I]; } ++I; NodeTemplate Res; Results.push(Hexagon::A2_tfrsi, MVT::i32, { DAG.getTargetConstant(S, dl, MVT::i32) }); Res.Opc = IsInc ? Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd; Res.Ty = PairTy; Res.Ops = { OpRef::hi(Arg), OpRef::lo(Arg), OpRef::res(-1) }; Results.push(Res); Arg = OpRef::res(Results.top()); } return !Extend ? Arg : OpRef::lo(Arg); } OpRef HvxSelector::butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results) { DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';}); // Butterfly shuffles. // // V6_vdelta // V6_vrdelta // V6_vror // The assumption here is that all elements picked by Mask are in the // first operand to the vector_shuffle. This assumption is enforced // by the caller. MVT ResTy = getSingleVT(MVT::i8); PermNetwork::Controls FC, RC; const SDLoc &dl(Results.InpNode); int VecLen = SM.Mask.size(); for (int M : SM.Mask) { if (M != -1 && M >= VecLen) return OpRef::fail(); } // Try the deltas/benes for both single vectors and vector pairs. ForwardDeltaNetwork FN(SM.Mask); if (FN.run(FC)) { SDValue Ctl = getVectorConstant(FC, dl); Results.push(Hexagon::V6_vdelta, ResTy, {Va, OpRef(Ctl)}); return OpRef::res(Results.top()); } // Try reverse delta. ReverseDeltaNetwork RN(SM.Mask); if (RN.run(RC)) { SDValue Ctl = getVectorConstant(RC, dl); Results.push(Hexagon::V6_vrdelta, ResTy, {Va, OpRef(Ctl)}); return OpRef::res(Results.top()); } // Do Benes. BenesNetwork BN(SM.Mask); if (BN.run(FC, RC)) { SDValue CtlF = getVectorConstant(FC, dl); SDValue CtlR = getVectorConstant(RC, dl); Results.push(Hexagon::V6_vdelta, ResTy, {Va, OpRef(CtlF)}); Results.push(Hexagon::V6_vrdelta, ResTy, {OpRef::res(-1), OpRef(CtlR)}); return OpRef::res(Results.top()); } return OpRef::fail(); } SDValue HvxSelector::getVectorConstant(ArrayRef Data, const SDLoc &dl) { SmallVector Elems; for (uint8_t C : Data) Elems.push_back(DAG.getConstant(C, dl, MVT::i8)); MVT VecTy = MVT::getVectorVT(MVT::i8, Data.size()); SDValue BV = DAG.getBuildVector(VecTy, dl, Elems); SDValue LV = Lower.LowerOperation(BV, DAG); DAG.RemoveDeadNode(BV.getNode()); return LV; } void HvxSelector::selectShuffle(SDNode *N) { DEBUG_WITH_TYPE("isel", { dbgs() << "Starting " << __func__ << " on node:\n"; N->dump(&DAG); }); MVT ResTy = N->getValueType(0).getSimpleVT(); // Assume that vector shuffles operate on vectors of bytes. assert(ResTy.isVector() && ResTy.getVectorElementType() == MVT::i8); auto *SN = cast(N); std::vector Mask(SN->getMask().begin(), SN->getMask().end()); // This shouldn't really be necessary. Is it? for (int &Idx : Mask) if (Idx != -1 && Idx < 0) Idx = -1; unsigned VecLen = Mask.size(); bool HavePairs = (2*HwLen == VecLen); assert(ResTy.getSizeInBits() / 8 == VecLen); // Vd = vector_shuffle Va, Vb, Mask // bool UseLeft = false, UseRight = false; for (unsigned I = 0; I != VecLen; ++I) { if (Mask[I] == -1) continue; unsigned Idx = Mask[I]; assert(Idx < 2*VecLen); if (Idx < VecLen) UseLeft = true; else UseRight = true; } DEBUG_WITH_TYPE("isel", { dbgs() << "VecLen=" << VecLen << " HwLen=" << HwLen << " UseLeft=" << UseLeft << " UseRight=" << UseRight << " HavePairs=" << HavePairs << '\n'; }); // If the mask is all -1's, generate "undef". if (!UseLeft && !UseRight) { ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode()); DAG.RemoveDeadNode(N); return; } SDValue Vec0 = N->getOperand(0); SDValue Vec1 = N->getOperand(1); ResultStack Results(SN); Results.push(TargetOpcode::COPY, ResTy, {Vec0}); Results.push(TargetOpcode::COPY, ResTy, {Vec1}); OpRef Va = OpRef::res(Results.top()-1); OpRef Vb = OpRef::res(Results.top()); OpRef Res = !HavePairs ? shuffs2(ShuffleMask(Mask), Va, Vb, Results) : shuffp2(ShuffleMask(Mask), Va, Vb, Results); bool Done = Res.isValid(); if (Done) { // Make sure that Res is on the stack before materializing. Results.push(TargetOpcode::COPY, ResTy, {Res}); materialize(Results); } else { Done = scalarizeShuffle(Mask, SDLoc(N), ResTy, Vec0, Vec1, N); } if (!Done) { #ifndef NDEBUG dbgs() << "Unhandled shuffle:\n"; SN->dumpr(&DAG); #endif llvm_unreachable("Failed to select vector shuffle"); } } void HvxSelector::selectRor(SDNode *N) { // If this is a rotation by less than 8, use V6_valignbi. MVT Ty = N->getValueType(0).getSimpleVT(); const SDLoc &dl(N); SDValue VecV = N->getOperand(0); SDValue RotV = N->getOperand(1); SDNode *NewN = nullptr; if (auto *CN = dyn_cast(RotV.getNode())) { unsigned S = CN->getZExtValue(); if (S % HST.getVectorLength() == 0) { NewN = VecV.getNode(); } else if (isUInt<3>(S)) { SDValue C = DAG.getTargetConstant(S, dl, MVT::i32); NewN = DAG.getMachineNode(Hexagon::V6_valignbi, dl, Ty, {VecV, VecV, C}); } } if (!NewN) NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV}); ISel.ReplaceNode(N, NewN); DAG.RemoveDeadNode(N); } void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) { HvxSelector(*this, *CurDAG).selectShuffle(N); } void HexagonDAGToDAGISel::SelectHvxRor(SDNode *N) { HvxSelector(*this, *CurDAG).selectRor(N); } void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) { const SDLoc &dl(N); SDValue Chain = N->getOperand(0); SDValue Address = N->getOperand(2); SDValue Predicate = N->getOperand(3); SDValue Base = N->getOperand(4); SDValue Modifier = N->getOperand(5); SDValue Offset = N->getOperand(6); unsigned Opcode; unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("Unexpected HVX gather intrinsic."); case Intrinsic::hexagon_V6_vgathermhq: case Intrinsic::hexagon_V6_vgathermhq_128B: Opcode = Hexagon::V6_vgathermhq_pseudo; break; case Intrinsic::hexagon_V6_vgathermwq: case Intrinsic::hexagon_V6_vgathermwq_128B: Opcode = Hexagon::V6_vgathermwq_pseudo; break; case Intrinsic::hexagon_V6_vgathermhwq: case Intrinsic::hexagon_V6_vgathermhwq_128B: Opcode = Hexagon::V6_vgathermhwq_pseudo; break; } SDVTList VTs = CurDAG->getVTList(MVT::Other); SDValue Ops[] = { Address, Predicate, Base, Modifier, Offset, Chain }; SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); cast(Result)->setMemRefs(MemOp, MemOp + 1); ReplaceUses(N, Result); CurDAG->RemoveDeadNode(N); } void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { const SDLoc &dl(N); SDValue Chain = N->getOperand(0); SDValue Address = N->getOperand(2); SDValue Base = N->getOperand(3); SDValue Modifier = N->getOperand(4); SDValue Offset = N->getOperand(5); unsigned Opcode; unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("Unexpected HVX gather intrinsic."); case Intrinsic::hexagon_V6_vgathermh: case Intrinsic::hexagon_V6_vgathermh_128B: Opcode = Hexagon::V6_vgathermh_pseudo; break; case Intrinsic::hexagon_V6_vgathermw: case Intrinsic::hexagon_V6_vgathermw_128B: Opcode = Hexagon::V6_vgathermw_pseudo; break; case Intrinsic::hexagon_V6_vgathermhw: case Intrinsic::hexagon_V6_vgathermhw_128B: Opcode = Hexagon::V6_vgathermhw_pseudo; break; } SDVTList VTs = CurDAG->getVTList(MVT::Other); SDValue Ops[] = { Address, Base, Modifier, Offset, Chain }; SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); cast(Result)->setMemRefs(MemOp, MemOp + 1); ReplaceUses(N, Result); CurDAG->RemoveDeadNode(N); } void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) { unsigned IID = cast(N->getOperand(0))->getZExtValue(); SDNode *Result; switch (IID) { case Intrinsic::hexagon_V6_vaddcarry: { SmallVector Ops = { N->getOperand(1), N->getOperand(2), N->getOperand(3) }; SDVTList VTs = CurDAG->getVTList(MVT::v16i32, MVT::v512i1); Result = CurDAG->getMachineNode(Hexagon::V6_vaddcarry, SDLoc(N), VTs, Ops); break; } case Intrinsic::hexagon_V6_vaddcarry_128B: { SmallVector Ops = { N->getOperand(1), N->getOperand(2), N->getOperand(3) }; SDVTList VTs = CurDAG->getVTList(MVT::v32i32, MVT::v1024i1); Result = CurDAG->getMachineNode(Hexagon::V6_vaddcarry, SDLoc(N), VTs, Ops); break; } case Intrinsic::hexagon_V6_vsubcarry: { SmallVector Ops = { N->getOperand(1), N->getOperand(2), N->getOperand(3) }; SDVTList VTs = CurDAG->getVTList(MVT::v16i32, MVT::v512i1); Result = CurDAG->getMachineNode(Hexagon::V6_vsubcarry, SDLoc(N), VTs, Ops); break; } case Intrinsic::hexagon_V6_vsubcarry_128B: { SmallVector Ops = { N->getOperand(1), N->getOperand(2), N->getOperand(3) }; SDVTList VTs = CurDAG->getVTList(MVT::v32i32, MVT::v1024i1); Result = CurDAG->getMachineNode(Hexagon::V6_vsubcarry, SDLoc(N), VTs, Ops); break; } default: llvm_unreachable("Unexpected HVX dual output intrinsic."); } ReplaceUses(N, Result); ReplaceUses(SDValue(N, 0), SDValue(Result, 0)); ReplaceUses(SDValue(N, 1), SDValue(Result, 1)); CurDAG->RemoveDeadNode(N); } Index: vendor/llvm/dist/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- vendor/llvm/dist/lib/Target/X86/X86ISelLowering.cpp (revision 327151) +++ vendor/llvm/dist/lib/Target/X86/X86ISelLowering.cpp (revision 327152) @@ -1,38505 +1,38511 @@ //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt ExperimentalVectorWideningLegalization( "x86-experimental-vector-widening-legalization", cl::init(false), cl::desc("Enable an experimental vector type legalization through widening " "rather than promotion."), cl::Hidden); static cl::opt ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments " "(the last x86-experimental-pref-loop-alignment bits" " of the loop header PC will be 0)."), cl::Hidden); static cl::opt MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden); /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without /// crashing. static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, const char *Msg) { MachineFunction &MF = DAG.getMachineFunction(); DAG.getContext()->diagnose( DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); } X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget.isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget.is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) addBypassSlowDiv(64, 32); } if (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } if (Subtarget.isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); } else if (Subtarget.isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); } else { setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); } // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget.is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::ABS , MVT::i64 , Custom); } // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget.is64Bit()) { if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) // f32/f64 are legal, f80 is custom. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); else setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget.useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); if (!Subtarget.useSoftFloat()) { // SSE has no i16 to fp conversion, only i32. if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); if (!Subtarget.useSoftFloat()) { // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); if (X86ScalarSSEf32) { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget.is64Bit()) { if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } else { setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); } } else if (!Subtarget.useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } else if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::BR_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); } if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); if (!Subtarget.hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } } if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); // Darwin ABI issue. for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::ConstantPool , VT, Custom); setOperationAction(ISD::JumpTable , VT, Custom); setOperationAction(ISD::GlobalAddress , VT, Custom); setOperationAction(ISD::GlobalTLSAddress, VT, Custom); setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SHL_PARTS, VT, Custom); setOperationAction(ISD::SRA_PARTS, VT, Custom); setOperationAction(ISD::SRL_PARTS, VT, Custom); } if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && TM.Options.ExceptionModel != ExceptionHandling::SjLj) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); bool Is64Bit = Subtarget.is64Bit(); setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG, VT, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // Expand FP immediates into loads from the stack, except for the special // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps } else if (UseX87 && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. addLegalFPImmediate(APFloat(+0.0f)); // xorps addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87, except f128 in MMX. if (UseX87) { if (Subtarget.is64Bit() && Subtarget.hasMMX()) { addRegisterClass(MVT::f128, &X86::FR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); setOperationAction(ISD::FABS , MVT::f128, Custom); setOperationAction(ISD::FNEG , MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); } addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); } // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i8, Custom); setOperationAction(ISD::MULHS, MVT::v16i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); setOperationAction(ISD::SMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v16i8, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); } for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // We support custom legalizing of sext and anyext loads for specific // memory vector types which we can load as a scalar (or sequence of // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); if (VT == MVT::v2i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationPromotedToType(ISD::AND, VT, MVT::v2i64); setOperationPromotedToType(ISD::OR, VT, MVT::v2i64); setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64); setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64); setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64); } // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); // We directly match byte blends in the backend as they match the VSELECT // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); } // i8 vectors are custom because the source register and source // source memory operand types are not the same width. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::ROTL, VT, Custom); // XOP can efficiently perform BITREVERSE with VPPERM. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) { bool HasInt256 = Subtarget.hasInt256(); addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); } if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::FMA, VT, Legal); } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); } setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); } if (HasInt256) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom); // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); } } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } if (HasInt256) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationPromotedToType(ISD::AND, VT, MVT::v4i64); setOperationPromotedToType(ISD::OR, VT, MVT::v4i64); setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64); setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64); setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64); } if (HasInt256) { // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MGATHER, VT, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); // Extends of v16i1/v8i1 to 128-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom); for (auto VT : { MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1, MVT::v32i1, MVT::v64i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16, MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32, MVT::v8i64, MVT::v32i16, MVT::v64i8}) { MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom); setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom); setTruncStoreAction(VT, MaskVT, Custom); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); if (!Subtarget.hasVLX()) { // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use // k-masks. for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); } setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom); // Without BWI we need to use custom lowering to handle MVT::v64i8 input. setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // Need to promote to 64-bit even though we have 32-bit masked instructions // because the IR optimizers rearrange bitcasts around logic ops leaving // too many variations to handle if we don't promote them. setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64); setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64); setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64); if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Legal under AVX1. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64); setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64); } }// has AVX-512 if (!Subtarget.useSoftFloat() && (Subtarget.hasAVX512() || Subtarget.hasVLX())) { // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SINT_TO_FP, VT, Legal); setOperationAction(ISD::UINT_TO_FP, VT, Legal); setOperationAction(ISD::FP_TO_SINT, VT, Legal); setOperationAction(ISD::FP_TO_UINT, VT, Legal); setOperationAction(ISD::MUL, VT, Legal); } } if (Subtarget.hasCDI()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); for (auto VT : { MVT::v32i1, MVT::v64i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); // Extends from v32i1 masks to 256-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); // Extends from v64i1 masks to 512-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::MUL, MVT::v32i16, Legal); setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationPromotedToType(ISD::AND, VT, MVT::v8i64); setOperationPromotedToType(ISD::OR, VT, MVT::v8i64); setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64); } for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v64i8, MVT::v32i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() && (Subtarget.hasAVX512() || Subtarget.hasVLX())) { for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); } // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); for (auto VT : { MVT::v2i1, MVT::v4i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } // TODO: v8i1 concat should be legal without VLX to support concats of // v1i1, but we won't legalize it correctly currently without introducing // a v4i1 concat in the middle. setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); // Extends from v2i1/v4i1 masks to 128-bit vectors. setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom); setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); } if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); // Support carry in as value rather than glue. setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); setLibcallName(RTLIB::MUL_I128, nullptr); } // Combine sin / cos into _sincos_stret if it is available. if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, ISD::FLOG10, ISD::FPOW, ISD::FSIN}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget.getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; // TODO: These control memcmp expansion in CGP and could be raised higher, but // that needs to benchmarked and balanced with the potential use of vector // load/store types (PR33329, PR33914). MaxLoadsPerMemcmp = 2; MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. verifyIntrinsicTables(); } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } bool X86TargetLowering::useStackGuardXorFP() const { // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. return Subtarget.getTargetTriple().isOSMSVCRT(); } SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { EVT PtrTy = getPointerTy(DAG.getDataLayout()); unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP; MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); return SDValue(Node, 0); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(EVT VT) const { if (ExperimentalVectorWideningLegalization && VT.getVectorNumElements() != 1 && VT.getVectorElementType().getSimpleVT() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { if (!VT.isVector()) return MVT::i8; if (Subtarget.hasAVX512()) { const unsigned NumElts = VT.getVectorNumElements(); // Figure out what this type will be legalized to. EVT LegalVT = VT; while (getTypeAction(Context, LegalVT) != TypeLegal) LegalVT = getTypeToTransformTo(Context, LegalVT); // If we got a 512-bit vector then we'll definitely have a vXi1 compare. if (LegalVT.getSimpleVT().is512BitVector()) return EVT::getVectorVT(Context, MVT::i1, NumElts); if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { // If we legalized to less than a 512-bit vector, then we will use a vXi1 // compare for vXi32/vXi64 for sure. If we have BWI we will also support // vXi16/vXi8. MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) return EVT::getVectorVT(Context, MVT::i1, NumElts); } } return VT.changeVectorElementTypeToInteger(); } /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast(Ty)) { if (VTy->getBitWidth() == 128) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast(Ty)) { for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) break; } } } /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { if (Subtarget.is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = DL.getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; } unsigned Align = 4; if (Subtarget.hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function &F = MF.getFunction(); if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX()) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we // choose an optimal type with a vector element larger than a byte, // getMemsetStores() may create an intermediate splat (using an integer // multiply) before we splat as a vector. return MVT::v32i8; } if (Subtarget.hasSSE2()) return MVT::v16i8; // TODO: Can SSE1 handle a byte vector? if (Subtarget.hasSSE1()) return MVT::v4f32; } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. // Also, do not use f64 to lower memset unless this is a memset of zeros. // The gymnastics of splatting a byte value into an XMM register and then // only using 8-byte stores (because this is a CPU with slow unaligned // 16-byte accesses) makes that a loser. return MVT::f64; } } // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. if (Subtarget.is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: // 8-byte and under are always assumed to be fast. *Fast = true; break; case 128: *Fast = !Subtarget.isUnalignedMem16Slow(); break; case 256: *Fast = !Subtarget.isUnalignedMem32Slow(); break; // TODO: What about AVX-512 (512-bit) accesses? } } // Misaligned accesses of any size are always allowed. return true; } /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (isPositionIndependent() && Subtarget.isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, ArgListTy &Args) const { // Only relabel X86-32 for C / Stdcall CCs. if (Subtarget.is64Bit()) return; if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) return; unsigned ParamRegs = 0; if (auto *M = MF->getFunction().getParent()) ParamRegs = M->getNumberRegisterParameters(); // Mark the first N int arguments as having reg for (unsigned Idx = 0; Idx < Args.size(); Idx++) { Type *T = Args[Idx].Ty; if (T->isPointerTy() || T->isIntegerTy()) if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { unsigned numRegs = 1; if (MF->getDataLayout().getTypeAllocSize(T) > 4) numRegs = 2; if (ParamRegs < numRegs) return; ParamRegs -= numRegs; Args[Idx].IsInReg = true; } } } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget.is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())); return Table; } /// This returns the relocation base for the given PIC jumptable, /// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. if (Subtarget.isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } std::pair X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: case MVT::v16f32: case MVT::v8f64: RRC = &X86::VR128XRegClass; break; } return std::make_pair(RRC, Cost); } unsigned X86TargetLowering::getAddressSpace() const { if (Subtarget.is64Bit()) return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257; return 256; } static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); } static Constant* SegmentOffset(IRBuilder<> &IRB, unsigned Offset, unsigned AddressSpace) { return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { // glibc, bionic, and Fuchsia have a special slot for the stack guard in // tcbhead_t; use it instead of the usual global variable (see // sysdeps/{i386,x86_64}/nptl/tls.h) if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { if (Subtarget.isTargetFuchsia()) { // defines ZX_TLS_STACK_GUARD_OFFSET with this value. return SegmentOffset(IRB, 0x10, getAddressSpace()); } else { // %fs:0x28, unless we're using a Kernel code model, in which case // it's %gs:0x28. gs:0x14 on i386. unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; return SegmentOffset(IRB, Offset, getAddressSpace()); } } return TargetLowering::getIRStackGuard(IRB); } void X86TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget.getTargetTriple().isOSMSVCRT()) { // MSVC CRT has a global variable holding security cookie. M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. auto *SecurityCheckCookie = cast( M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext()))); SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); return; } // glibc, bionic, and Fuchsia have a special slot for the stack guard. if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) return; TargetLowering::insertSSPDeclarations(M); } Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { // MSVC CRT has a global variable holding security cookie. if (Subtarget.getTargetTriple().isOSMSVCRT()) return M.getGlobalVariable("__security_cookie"); return TargetLowering::getSDagStackGuard(M); } Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. if (Subtarget.getTargetTriple().isOSMSVCRT()) return M.getFunction("__security_check_cookie"); return TargetLowering::getSSPStackGuardCheck(M); } Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (Subtarget.getTargetTriple().isOSContiki()) return getDefaultSafeStackPointerLocation(IRB, false); // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget.isTargetAndroid()) { // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: // %gs:0x24 on i386 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; return SegmentOffset(IRB, Offset, getAddressSpace()); } // Fuchsia is similar. if (Subtarget.isTargetFuchsia()) { // defines ZX_TLS_UNSAFE_SP_OFFSET with this value. return SegmentOffset(IRB, 0x18, getAddressSpace()); } return TargetLowering::getSafeStackPointerLocation(IRB); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); return SrcAS < 256 && DestAS < 256; } //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// #include "X86GenCallingConv.inc" bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } /// Lowers masks values (v*i1) to the local register values /// \returns DAG node after lowering to register type static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { EVT ValVT = ValArg.getValueType(); if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { // Two stage lowering might be required // bitcast: v8i1 -> i8 / v16i1 -> i16 // anyextend: i8 -> i32 / i16 -> i32 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16; SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); if (ValLoc == MVT::i32) ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); return ValToCopy; } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { // One stage lowering is required // bitcast: v32i1 -> i32 / v64i1 -> i64 return DAG.getBitcast(ValLoc, ValArg); } else return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg); } /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, SmallVector, 8> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The value should reside in two registers"); // Before splitting the value we cast it to i64 Arg = DAG.getBitcast(MVT::i64, Arg); // Splitting the value into two i32 types SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, DAG.getConstant(0, Dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, DAG.getConstant(1, Dl, MVT::i32)); // Attach the two i32 types into corresponding registers RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); // In some cases we need to disable registers from the default CSR list. // For example, when they are used for argument passing. bool ShouldDisableCalleeSavedRegister = CallConv == CallingConv::X86_RegCall || MF.getFunction().hasFnAttribute("no_caller_saved_registers"); if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, MVT::i32)); // Copy the result values into the output registers. for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); // Add the register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } else if (ValVT == MVT::f64 && (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget.is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget.hasSSE2()) ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } } SmallVector, 8> RegsToPass; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], Subtarget); assert(2 == RegsToPass.size() && "Expecting two registers after Pass64BitArgInRegs"); // Add the second register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } // Add nodes to the DAG and add the values into the RetOps list for (auto &Reg : RegsToPass) { Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); } } // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. // // Checking Function.hasStructRetAttr() here is insufficient because the IR // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { // When we have both sret and another return value, we should use the // original Chain stored in RetOps[0], instead of the current Chain updated // in the above loop. If we only have sret, RetOps[0] equals to Chain. // For the case of sret and another return value, we have // Chain_0 at the function entry // Chain_1 = getCopyToReg(Chain_0) in the above loop // If we use Chain_1 in getCopyFromReg, we will have // Val = getCopyFromReg(Chain_1) // Chain_2 = getCopyToReg(Chain_1, Val) from below // getCopyToReg(Chain_0) will be glued together with // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be // in Unit B, and we will have cyclic dependency between Unit A and Unit B: // Data dependency from Unit B to Unit A due to usage of Val in // getCopyToReg(Chain_1, Val) // Chain dependency from Unit A to Unit B // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); // Add the returned register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RetValReg); } const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (X86::GR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); X86ISD::NodeType opcode = X86ISD::RET_FLAG; if (CallConv == CallingConv::X86_INTR) opcode = X86ISD::IRET; return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; // If we are returning more than one value, we can definitely // not make a tail call see PR19530 if (UI->getNumOperands() > 4) return false; if (UI->getNumOperands() == 4 && UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT = MVT::i32; bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { // The ABI does not require i1, i8 or i16 to be extended. // // On Darwin, there is code in the wild relying on Clang's old behaviour of // always extending i8/i16 return values, so keep doing that for now. // (PR26665). ReturnMVT = MVT::i8; } EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// Reads two 32 bit registers and creates a 64 bit mask value. /// \param VA The current 32 bit value that need to be assigned. /// \param NextVA The next 32 bit value that need to be assigned. /// \param Root The parent DAG node. /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for /// glue purposes. In the case the DAG is already using /// physical register instead of virtual, we should glue /// our new SDValue to InFlag SDvalue. /// \return a new SDvalue of size 64bit. static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &Dl, const X86Subtarget &Subtarget, SDValue *InFlag = nullptr) { assert((Subtarget.hasBWI()) && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(VA.getValVT() == MVT::v64i1 && "Expecting first location of 64 bit width type"); assert(NextVA.getValVT() == VA.getValVT() && "The locations should have the same type"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The values should reside in two registers"); SDValue Lo, Hi; unsigned Reg; SDValue ArgValueLo, ArgValueHi; MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterClass *RC = &X86::GR32RegClass; // Read a 32 bit value from the registers if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); } else { // When a physical register is available read the value from it and glue // the reads together. ArgValueLo = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueLo.getValue(2); ArgValueHi = DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueHi.getValue(2); } // Convert the i32 type into v32i1 type Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); // Convert the i32 type into v32i1 type Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); // Concatenate the two values together return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); } /// The function will lower a register of various sizes (8/16/32/64) /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) /// \returns a DAG node contains the operand after lowering to mask type. static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { SDValue ValReturned = ValArg; if (ValVT == MVT::v1i1) return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); if (ValVT == MVT::v64i1) { // In 32 bit machine, this case is handled by getv64i1Argument assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); // In 64 bit machine, There is no need to truncate the value only bitcast } else { MVT maskLen; switch (ValVT.getSimpleVT().SimpleTy) { case MVT::v8i1: maskLen = MVT::i8; break; case MVT::v16i1: maskLen = MVT::i16; break; case MVT::v32i1: maskLen = MVT::i32; break; default: llvm_unreachable("Expecting a vector of i1 types"); } ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); } return DAG.getBitcast(ValVT, ValReturned); } /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue X86TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, uint32_t *RegMask) const { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector RVLocs; bool Is64Bit = Subtarget.is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; ++I, ++InsIndex) { CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); // In some calling conventions we need to remove the used registers // from the register mask. if (RegMask) { for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); } // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { if (!Subtarget.hasX87()) report_fatal_error("X87 register return with X87 disabled"); CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } SDValue Val; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Val = getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag) .getValue(1); Val = Chain.getValue(0); InFlag = Chain.getValue(2); } if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1, dl)); if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) { if (VA.getValVT().isVector() && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); } else Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); } InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(const SmallVectorImpl &Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(const SmallVectorImpl &Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, /*isTailCall*/false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || CC == CallingConv::HHVM); } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: return true; default: return canGuaranteeTCO(CC); } } /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; ImmutableCallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; return true; } SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo &MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; MVT PtrVT = getPointerTy(DAG.getDataLayout()); // If value is passed by pointer we have address passed instead of the value // itself. No need to extend if the mask value and location share the same // absolute size. bool ExtendedInMem = VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); // Calculate SP offset of interrupt parameter, re-arrange the slot normally // taken by a return address. int Offset = 0; if (CallConv == CallingConv::X86_INTR) { // X86 interrupts may take one or two arguments. // On the stack there will be no return address as in regular call. // Offset of last argument need to be set to -4/-8 bytes. // Where offset of the first argument out of two, should be set to 0 bytes. Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); if (Subtarget.is64Bit() && Ins.size() == 2) { // The stack pointer needs to be realigned for 64 bit handlers with error // code, so the argument offset changes by 8 bytes. Offset += 8; } } // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } return DAG.getFrameIndex(FI, PtrVT); } // This is an argument in memory. We might be able to perform copy elision. if (Flags.isCopyElisionCandidate()) { EVT ArgVT = Ins[i].ArgVT; SDValue PartAddr; if (Ins[i].PartOffset == 0) { // If this is a one-part value or the first part of a multi-part value, // create a stack object for the entire argument value type and return a // load from our portion of it. This assumes that if the first part of an // argument is in memory, the rest will also be in memory. int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), /*Immutable=*/false); PartAddr = DAG.getFrameIndex(FI, PtrVT); return DAG.getLoad( ValVT, dl, Chain, PartAddr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); } else { // This is not the first piece of an argument in memory. See if there is // already a fixed stack object including this offset. If so, assume it // was created by the PartOffset == 0 branch above and create a load from // the appropriate offset into it. int64_t PartBegin = VA.getLocMemOffset(); int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; int FI = MFI.getObjectIndexBegin(); for (; MFI.isFixedObjectIndex(FI); ++FI) { int64_t ObjBegin = MFI.getObjectOffset(FI); int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) break; } if (MFI.isFixedObjectIndex(FI)) { SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); return DAG.getLoad( ValVT, dl, Chain, Addr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, Ins[i].PartOffset)); } } } int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, VA.getLocMemOffset(), isImmutable); // Set SExt or ZExt flag. if (VA.getLocInfo() == CCValAssign::ZExt) { MFI.setObjectZExt(FI, true); } else if (VA.getLocInfo() == CCValAssign::SExt) { MFI.setObjectSExt(FI, true); } // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); return ExtendedInMem ? (VA.getValVT().isVector() ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) : Val; } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. // TODO: __vectorcall will change this. return None; } const Function &F = MF.getFunction(); bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool isSoftFloat = Subtarget.useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } #ifndef NDEBUG static bool isSortedByValueNo(const SmallVectorImpl &ArgLocs) { return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), [](const CCValAssign &A, const CCValAssign &B) -> bool { return A.getValNo() < B.getValNo(); }); } #endif SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Function &F = MF.getFunction(); if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && F.getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo &MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); assert( !(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); if (CallConv == CallingConv::X86_INTR) { bool isLegal = Ins.size() == 1 || (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || (!Is64Bit && Ins[1].VT == MVT::i32))); if (!isLegal) report_fatal_error("X86 interrupts may take one or two arguments"); } // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Ins, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); } // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++InsIndex) { assert(InsIndex < Ins.size() && "Invalid Ins index"); CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); if (VA.needsCustom()) { assert( VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // v64i1 values, in regcall calling convention, that are // compiled to 32 bit arch, are split up into two registers. ArgValue = getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); } else { const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; else if (RegVT == MVT::f80) RC = &X86::RFP80RegClass; else if (RegVT == MVT::f128) RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; else if (RegVT == MVT::v1i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; else if (RegVT == MVT::v32i1) RC = &X86::VK32RegClass; else if (RegVT == MVT::v64i1) RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else if (VA.getValVT().isVector() && VA.getValVT().getScalarType() == MVT::i1 && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); } else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); } // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); InVals.push_back(ArgValue); } for (unsigned I = 0, E = Ins.size(); I != E; ++I) { // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. if (CallConv == CallingConv::Swift) continue; // All x86 ABIs require that for returning structs by value we copy the // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. if (Ins[I].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } } unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. if (shouldGuaranteeTCO(CallConv, MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. if (MFI.hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. assert(!(Subtarget.useSoftFloat() && F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. if (Is64Bit && isVarArg && MFI.hasVAStart()) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. SmallVector LiveGPRs; SmallVector LiveXMMRegs; SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); LiveXMMRegs.push_back( DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } } if (IsWin64) { // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so // they may be loaded by dereferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy(DAG.getDataLayout())); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), Offset)); MemOps.push_back(Store); Offset += 8; } if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. SmallVector SaveXMMOps; SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex(), dl)); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset(), dl)); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. if (Subtarget.hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; else if (Subtarget.hasAVX()) VecVT = MVT::v8f32; else if (Subtarget.hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. SmallVector RegParmTypes; MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; RegParmTypes.push_back(IntVT); if (VecVT != MVT::Other) RegParmTypes.push_back(VecVT); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } // Copy all forwards from physical to virtual registers. for (ForwardedRegister &F : Forwards) { // FIXME: Can we use a less constrained schedule? SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } // Some CCs need callee pop. if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { // X86 interrupts must pop the error code (and the alignment padding) if // present. FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { assert(Is64Bit); // TODO: Add a mechanism to frame lowering that will allow us to indicate // that we'd prefer this slot be allocated towards the bottom of the frame // (i.e. near the stack pointer after allocating the frame). Every // funclet needs a copy of this slot in its (mostly empty) frame, and the // offset from the bottom of this and each funclet's frame must be the // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accommodate holding this slot at the correct offset). int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } if (CallConv == CallingConv::X86_RegCall || F.hasFnAttribute("no_caller_saved_registers")) { MachineRegisterInfo &MRI = MF.getRegInfo(); for (std::pair Pair : MRI.liveins()) MRI.disableCalleeSavedRegister(Pair.first); } return Chain; } SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } /// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr( SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, const SDLoc &dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); return SDValue(OutRetAddr.getNode(), 1); } /// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, const SDLoc &dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewReturnAddrFI)); return Chain; } /// Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; bool &isTailCall = CLI.IsTailCall; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo(); auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null(CLI.CS.getInstruction()); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); if (Attr.getValueAsString() == "true") isTailCall = false; if (Subtarget.isPICStyleGOT() && !MF.getTarget().Options.GuaranteedTailCallOpt) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code // that require lazy function symbol resolution. Using musttail or // GuaranteedTailCallOpt will override this. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G || (!G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility())) isTailCall = false; } bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address // around. isTailCall = true; } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, MF.getFunction().hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) ++NumTailCalls; } assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Outs, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. if (FPDiff < X86Info->getTCReturnAddrDelta()) X86Info->setTCReturnAddrDelta(FPDiff); } unsigned NumBytesToPush = NumBytes; unsigned NumBytesToPop = NumBytes; // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; if (!ArgLocs.back().isMemLoc()) report_fatal_error("cannot use inalloca attribute on a register " "parameter"); if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); } if (!IsSibcall) Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; if (Flags.isInAlloca()) continue; CCValAssign &VA = ArgLocs[I]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[OutIndex]; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getBitcast(MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: Arg = DAG.getBitcast(RegVT, Arg); break; case CCValAssign::Indirect: { // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore( Chain, dl, Arg, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); Arg = SpillSlot; break; } } if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // Split v64i1 value into two registers Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; case X86::XMM2: ShadowReg = X86::R8; break; case X86::XMM3: ShadowReg = X86::R9; break; } if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget.isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair( unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of // the tail jump. This is done to circumvent the ebx/callee-saved problem // for tail calls on PIC/GOT architectures. Normally we would just put the // address of GOT into ebx and then call target@PLT. But for tail calls // ebx would be restored (since ebx is callee saved) before jumping to the // target@PLT. // Note: The actual moving to ECX is done further down. GlobalAddressSDNode *G = dyn_cast(Callee); if (G && !G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa(Callee)) Callee = LowerExternalSymbol(Callee, DAG); } } if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); SmallVector MemOpChains2; SDValue FIN; int FI = 0; for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { if (VA.needsCustom()) { assert((CallConv == CallingConv::X86_RegCall) && "Expecting custom case only in regcall calling convention"); // This means that we are in special case where one argument was // passed through two register locations - Skip the next location ++I; } continue; } assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. MemOpChains2.push_back(DAG.getStore( ArgChain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, getPointerTy(DAG.getDataLayout()), RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into registers. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. GlobalAddressSDNode* G = cast(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportStorageClass()) { unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); Callee = DAG.getTargetGlobalAddress( GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); if (OpFlags == X86II::MO_GOTPCREL) { // Add a wrapper. Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(DAG.getDataLayout()), Callee); // Add extra indirection Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); Callee = DAG.getTargetExternalSymbol( S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector Ops; if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } Ops.push_back(Chain); Ops.push_back(Callee); if (isTailCall) Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we // set X86_INTR calling convention because it has the same CSR mask // (same preserved registers). const uint32_t *Mask = RegInfo->getCallPreservedMask( MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv); assert(Mask && "Missing call preserved mask for calling convention"); // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) { const Function &CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn.hasPersonalityFn() ? classifyEHPersonality(CallerFn.getPersonalityFn()) : EHPersonality::Unknown; if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } // Define a new register mask from the existing mask. uint32_t *RegMask = nullptr; // In some calling conventions we need to remove the used physical registers // from the reg mask. if (CallConv == CallingConv::X86_RegCall || HasNCSR) { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Allocate a new Reg Mask and copy Mask. RegMask = MF.allocateRegisterMask(TRI->getNumRegs()); unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize); // Make sure all sub registers of the argument registers are reset // in the RegMask. for (auto const &RegPair : RegsToPass) for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); // Create the RegMask Operand according to our updated mask. Ops.push_back(DAG.getRegisterMask(RegMask)); } else { // Create the RegMask Operand according to the static mask. Ops.push_back(DAG.getRegisterMask(Mask)); } if (InFlag.getNode()) Ops.push_back(InFlag); if (isTailCall) { // We used to do: //// If this is the first return lowered for this function, add the regs //// to the liveout set for the function. // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPop = 4; else NumBytesForCalleeToPop = 0; // Callee pops nothing. if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { // No need to reset the stack after the call if the call doesn't return. To // make the MI verify, we'll pretend the callee does it for us. NumBytesForCalleeToPop = NumBytes; } // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals, RegMask); } //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// // Like std call, callee cleans arguments, convention except that ECX is // reserved for storing the tail called function address. Only 2 registers are // free for argument passing (inreg). Tail call optimization is performed // provided: // * tailcallopt is enabled // * caller/callee are fastcc // On X86_64 architecture with GOT-style position independent code only local // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the // original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 // arg2 // RETADDR // [ new RETADDR // move area ] // (possible EBP) // ESI // EDI // local1 .. /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; } /// Return true if the given stack call argument is already available in the /// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII, const CCValAssign &VA) { unsigned Bytes = Arg.getValueSizeInBits() / 8; for (;;) { // Look through nodes that don't alter the bits of the incoming value. unsigned Op = Arg.getOpcode(); if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { Arg = Arg.getOperand(0); continue; } if (Op == ISD::TRUNCATE) { const SDValue &TruncInput = Arg.getOperand(0); if (TruncInput.getOpcode() == ISD::AssertZext && cast(TruncInput.getOperand(1))->getVT() == Arg.getValueType()) { Arg = TruncInput.getOperand(0); continue; } } break; } int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(*Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI.isFixedObjectIndex(FI)) return false; if (Offset != MFI.getObjectOffset(FI)) return false; // If this is not byval, check that the argument stack object is immutable. // inalloca and argument copy elision can create mutable argument stack // objects. Byval objects can be mutated, but a byval call intends to pass the // mutated memory. if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) return false; if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) { // If the argument location is wider than the argument type, check that any // extension flags match. if (Flags.isZExt() != MFI.isObjectZExt(FI) || Flags.isSExt() != MFI.isObjectSExt(FI)) { return false; } } return Bytes == MFI.getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this // space. if (IsCalleeWin64 != IsCallerWin64) return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector RVLocs; CCState CCInfo(CalleeCC, false, MF, RVLocs, C); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, RetCC_X86, RetCC_X86)) return false; // The callee has to preserve all registers the caller needs to preserve. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } unsigned StackArgsSize = 0; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII, VA)) return false; } } } bool PositionIndependent = isPositionIndependent(); // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget.is64Bit() && ((!isa(Callee) && !isa(Callee)) || PositionIndependent)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = PositionIndependent ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; unsigned Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; } bool CalleeWillPop = X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); if (unsigned BytesToPop = MF.getInfo()->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) return false; } else if (CalleeWillPop && StackArgsSize > 0) { // If we don't have bytes to pop, make sure the callee doesn't pop any. return false; } return true; } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// static bool MayFoldLoad(SDValue Op) { return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); } static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } static bool MayFoldIntoZeroExtend(SDValue Op) { if (Op.hasOneUse()) { unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); return (ISD::ZERO_EXTEND == Opcode); } return false; } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::MOVLHPS: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VBROADCAST: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::VPERMIL2: case X86ISD::VPERMI: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIV3: case X86ISD::VZEXT_MOVL: return true; } } static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; // Target Shuffles. case X86ISD::PSHUFB: case X86ISD::VPERMILPV: case X86ISD::VPERMIL2: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIV3: return true; // 'Faux' Target Shuffles. case ISD::AND: case X86ISD::ANDNP: return true; } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!hasSymbolicDisplacement) return true; // FIXME: Some tweaks might be needed for medium code model. if (M != CodeModel::Small && M != CodeModel::Kernel) return false; // For small code model we assume that latest object is 16MB before end of 31 // bits boundary. We may also accept pretty large negative constants knowing // that all objects are in the positive half of address space. if (M == CodeModel::Small && Offset < 16*1024*1024) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (M == CodeModel::Kernel && Offset >= 0) return true; return false; } /// Determines whether the callee is required to pop its own arguments. /// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { // If GuaranteeTCO is true, we force some calls to be callee pop so that we // can guarantee TCO. if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) return true; switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::X86_VectorCall: return !is64Bit; } } /// \brief Return true if the condition is an unsigned comparison operation. static bool isX86CCUnsigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: case X86::COND_NE: case X86::COND_B: case X86::COND_A: case X86::COND_BE: case X86::COND_AE: return true; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return false; } } static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { switch (SetCCOpcode) { default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; } } /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } } return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; } } /// Is there a floating point cmov for the specific X86 condition code? /// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) return false; Info.opc = ISD::INTRINSIC_W_CHAIN; Info.flags = MachineMemOperand::MONone; Info.offset = 0; switch (IntrData->Type) { case EXPAND_FROM_MEM: { Info.ptrVal = I.getArgOperand(0); Info.memVT = MVT::getVT(I.getType()); Info.align = 1; Info.flags |= MachineMemOperand::MOLoad; break; } case COMPRESS_TO_MEM: { Info.ptrVal = I.getArgOperand(0); Info.memVT = MVT::getVT(I.getArgOperand(1)->getType()); Info.align = 1; Info.flags |= MachineMemOperand::MOStore; break; } case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; if (IntrData->Type == TRUNCATE_TO_MEM_VI8) ScalarVT = MVT::i8; else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) ScalarVT = MVT::i16; else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); Info.align = 1; Info.flags |= MachineMemOperand::MOStore; break; } default: return false; } return true; } /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; } return false; } bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. SDValue BasePtr = cast(Load)->getBasePtr(); if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; return true; } /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0 || BitSize > 64) return false; return true; } bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { // TODO: It might be a win to ease or lift this restriction, but the generic // folds in DAGCombiner conflict with vector folds for an AVX512 target. if (VT.isVector() && Subtarget.hasAVX512()) return false; return true; } bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; // Mask vectors support all subregister combinations and operations that // extract half of vector. if (ResVT.getVectorElementType() == MVT::i1) return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) && (Index == ResVT.getVectorNumElements())); return (Index % ResVT.getVectorNumElements()) == 0; } bool X86TargetLowering::isCheapToSpeculateCttz() const { // Speculate cttz only if we can directly use TZCNT. return Subtarget.hasBMI(); } bool X86TargetLowering::isCheapToSpeculateCtlz() const { // Speculate ctlz only if we can directly use LZCNT. return Subtarget.hasLZCNT(); } bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const { // Do not merge to float value size (128 bytes) if no implicit // float attribute is set. bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (NoFloat) { unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; return (MemVT.getSizeInBits() <= MaxIntSize); } return true; } bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( const Instruction &AndI) const { return true; } bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { if (!Subtarget.hasBMI()) return false; // There are only 32-bit and 64-bit forms for 'andn'. EVT VT = Y.getValueType(); if (VT != MVT::i32 && VT != MVT::i64) return false; return true; } MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { MVT VT = MVT::getIntegerVT(NumBits); if (isTypeLegal(VT)) return VT; // PMOVMSKB can handle this. if (NumBits == 128 && isTypeLegal(MVT::v16i8)) return MVT::v16i8; // VPMOVMSKB can handle this. if (NumBits == 256 && isTypeLegal(MVT::v32i8)) return MVT::v32i8; // TODO: Allow 64-bit type for 32-bit target. // TODO: 512-bit types should be allowed, but make sure that those // cases are handled in combineVectorSizedSetCCEquality(). return MVT::INVALID_SIMPLE_VALUE_TYPE; } /// Val is the undef sentinel value or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return ((Val == SM_SentinelUndef) || (Val == CmpVal)); } /// Val is either the undef or zero sentinel value. static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (Mask[i] != SM_SentinelUndef) return false; return true; } /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi); } /// Return true if every element in Mask is undef or if its value /// falls within the specified range (L, H]. static bool isUndefOrInRange(ArrayRef Mask, int Low, int Hi) { for (int M : Mask) if (!isUndefOrInRange(M, Low, Hi)) return false; return true; } /// Return true if Val is undef, zero or if its value falls within the /// specified range (L, H]. static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { return isUndefOrZero(Val) || (Val >= Low && Val < Hi); } /// Return true if every element in Mask is undef, zero or if its value /// falls within the specified range (L, H]. static bool isUndefOrZeroOrInRange(ArrayRef Mask, int Low, int Hi) { for (int M : Mask) if (!isUndefOrZeroOrInRange(M, Low, Hi)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size], or is undef or is zero. static bool isSequentialOrUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low) if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef or is zero. static bool isUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (!isUndefOrZero(Mask[i])) return false; return true; } /// \brief Helper function to test whether a shuffle mask could be /// simplified by widening the elements being shuffled. /// /// Appends the mask for wider elements in WidenedMask if valid. Otherwise /// leaves it in an unspecified state. /// /// NOTE: This must handle normal vector shuffle masks and *target* vector /// shuffle masks. The latter have the special property of a '-2' representing /// a zero-ed lane of a vector. static bool canWidenShuffleElements(ArrayRef Mask, SmallVectorImpl &WidenedMask) { WidenedMask.assign(Mask.size() / 2, 0); for (int i = 0, Size = Mask.size(); i < Size; i += 2) { int M0 = Mask[i]; int M1 = Mask[i + 1]; // If both elements are undef, its trivial. if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { WidenedMask[i / 2] = SM_SentinelUndef; continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { WidenedMask[i / 2] = M1 / 2; continue; } if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { WidenedMask[i / 2] = M0 / 2; continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { WidenedMask[i / 2] = SM_SentinelZero; continue; } return false; } // Finally check if the two mask values are adjacent and aligned with // a pair. if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { WidenedMask[i / 2] = M0 / 2; continue; } // Otherwise we can't safely widen the elements used in this shuffle. return false; } assert(WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"); return true; } /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); } // Build a vector of constants. // Use an UNDEF node if MaskElt == -1. // Split 64-bit constants in the 32-bit mode. static SDValue getConstVector(ArrayRef Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask = false) { SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0; i < NumElts; ++i) { bool IsUndef = Values[i] < 0 && IsMask; SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT); Ops.push_back(OpNode); if (Split) Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(0, dl, EltVT)); } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); if (Split) ConstsNode = DAG.getBitcast(VT, ConstsNode); return ConstsNode; } static SDValue getConstVector(ArrayRef Bits, APInt &Undefs, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(Bits.size() == Undefs.getBitWidth() && "Unequal constant and undef arrays"); SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Undefs[i]) { Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT)); continue; } const APInt &V = Bits[i]; assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes"); if (Split) { Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT)); Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT)); } else if (EltVT == MVT::f32) { APFloat FV(APFloat::IEEEsingle(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else if (EltVT == MVT::f64) { APFloat FV(APFloat::IEEEdouble(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else { Ops.push_back(DAG.getConstant(V, dl, EltVT)); } } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); return DAG.getBitcast(VT, ConstsNode); } /// Returns a vector of specified type with all zero elements. static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && "Unexpected vector type"); // Try to build SSE/AVX zero vectors as bitcasted to their dest // type. This ensures they get CSE'd. But if the integer type is not // available, use a floating-point +0.0 instead. SDValue Vec; if (!Subtarget.hasSSE2() && VT.is128BitVector()) { Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) && "Unexpected vector type"); Vec = DAG.getConstant(0, dl, VT); } else { unsigned Num32BitElts = VT.getSizeInBits() / 32; Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts)); } return DAG.getBitcast(VT, Vec); } static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector(ResultVT, dl, Vec->ops().slice(IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert((Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); return extractSubVector(Vec, IdxVal, DAG, dl, 128); } /// Generate a DAG to grab 256-bits from a 512-bit vector. static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); return extractSubVector(Vec, IdxVal, DAG, dl, 256); } static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.isUndef()) return Result; EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } // Return true if the instruction zeroes the unused upper part of the // destination and accepts mask. static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { switch (Opcode) { default: return false; case X86ISD::TESTM: case X86ISD::TESTNM: case X86ISD::PCMPEQM: case X86ISD::PCMPGTM: case X86ISD::CMPM: case X86ISD::CMPMU: case X86ISD::CMPM_RND: return true; } } /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); if (!isa(Idx)) return SDValue(); // Inserting undef is a nop. We can just return the original vector. if (SubVec.isUndef()) return Vec; unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); // Extend to natively supported kshift. MVT WideOpVT = OpVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts // if necessary. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { // May need to promote to a legal type. Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), SubVec, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } MVT SubVecVT = SubVec.getSimpleValueType(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); SDValue Undef = DAG.getUNDEF(WideOpVT); if (IdxVal == 0) { // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), SubVec, ZeroIdx); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx); if (Vec.isUndef()) { assert(IdxVal != 0 && "Unexpected index"); SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } if (ISD::isBuildVectorAllZeros(Vec.getNode())) { assert(IdxVal != 0 && "Unexpected index"); NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(ShiftLeft, dl, MVT::i8)); if (ShiftRight != 0) SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, DAG.getConstant(ShiftRight, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows // isel to opimitize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), Vec, ZeroIdx); } else { // Otherwise use explicit shifts to zero the bits. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); NumElems = WideOpVT.getVectorNumElements(); SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); } Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } // Inserting into the middle is more complicated. NumElems = WideOpVT.getVectorNumElements(); // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); // Move the current value of the bit to be replace to the lsbs. Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Xor with the new bit. Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); // Shift to MSB, filling bottom bits with 0. unsigned ShiftLeft = NumElems - SubVecNumElems; Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, DAG.getConstant(ShiftLeft, dl, MVT::i8)); // Shift to the final position, filling upper bits with 0. unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, DAG.getConstant(ShiftRight, dl, MVT::i8)); // Xor with original vector leaving the new value. Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); // Reduce to original width if needed. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower /// large BUILD_VECTORS. static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, const SDLoc &dl) { SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return insert128BitVector(V, V2, NumElems / 2, DAG, dl); } static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, const SDLoc &dl) { SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return insert256BitVector(V, V2, NumElems / 2, DAG, dl); } /// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); APInt Ones = APInt::getAllOnesValue(32); unsigned NumElts = VT.getSizeInBits() / 32; SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode"); if (VT.is128BitVector() && InVT.is128BitVector()) return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT) : DAG.getZeroExtendVectorInReg(In, DL, VT); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) { int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, std::max(128, (int)VT.getSizeInBits() / Scale)); } return DAG.getNode(Opc, DL, VT, In); } /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); int NumElems = VT.getVectorNumElements(); SmallVector MaskVec(NumElems); for (int i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec[i] = (i == Idx) ? NumElems : i; return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } static SDValue peekThroughBitcasts(SDValue V) { while (V.getNode() && V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); return V; } static SDValue peekThroughOneUseBitcasts(SDValue V) { while (V.getNode() && V.getOpcode() == ISD::BITCAST && V.getOperand(0).hasOneUse()) V = V.getOperand(0); return V; } static const Constant *getTargetConstantFromNode(SDValue Op) { Op = peekThroughBitcasts(Op); auto *Load = dyn_cast(Op); if (!Load) return nullptr; SDValue Ptr = Load->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *CNode = dyn_cast(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry()) return nullptr; return dyn_cast(CNode->getConstVal()); } // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl &EltBits, bool AllowWholeUndefs = true, bool AllowPartialUndefs = true) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; // Bitcast a source array of element bits to the target size. auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && "Constant bit sizes don't match"); // Don't split if we don't allow undef bits. bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; if (UndefSrcElts.getBoolValue() && !AllowUndefs) return false; // If we're already the right size, don't bother bitcasting. if (NumSrcElts == NumElts) { UndefElts = UndefSrcElts; EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); return true; } // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(SizeInBits, 0); APInt MaskBits(SizeInBits, 0); for (unsigned i = 0; i != NumSrcElts; ++i) { unsigned BitOffset = i * SrcEltSizeInBits; if (UndefSrcElts[i]) UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); MaskBits.insertBits(SrcEltBits[i], BitOffset); } // Split the undef/constant single bitset data into the target elements. UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { unsigned BitOffset = i * EltSizeInBits; APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnesValue()) { if (!AllowWholeUndefs) return false; UndefElts.setBit(i); continue; } // If only some bits are UNDEF then treat them as zero (or bail if not // supported). if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); EltBits[i] = Bits.getZExtValue(); } return true; }; // Collect constant bits and insert into mask/undef bit masks. auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, unsigned UndefBitIndex) { if (!Cst) return false; if (isa(Cst)) { Undefs.setBit(UndefBitIndex); return true; } if (auto *CInt = dyn_cast(Cst)) { Mask = CInt->getValue(); return true; } if (auto *CFP = dyn_cast(Cst)) { Mask = CFP->getValueAPF().bitcastToAPInt(); return true; } return false; }; // Handle UNDEFs. if (Op.isUndef()) { APInt UndefSrcElts = APInt::getAllOnesValue(NumElts); SmallVector SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract scalar constant bits. if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getNullValue(1); SmallVector SrcEltBits(1, Cst->getAPIntValue()); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from build vector. if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { const SDValue &Src = Op.getOperand(i); if (Src.isUndef()) { UndefSrcElts.setBit(i); continue; } auto *Cst = cast(Src); SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); } return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits())) return false; unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumSrcElts = CstTy->getVectorNumElements(); APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0; i != NumSrcElts; ++i) if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], UndefSrcElts, i)) return false; return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from a broadcasted constant pool scalar. if (Op.getOpcode() == X86ISD::VBROADCAST && EltSizeInBits <= VT.getScalarSizeInBits()) { if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { if (UndefSrcElts[0]) UndefSrcElts.setBits(0, NumSrcElts); SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); return CastBitData(UndefSrcElts, SrcEltBits); } } } // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && isa(Op.getOperand(0).getOperand(0))) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits; auto *CN = cast(Op.getOperand(0).getOperand(0)); SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits)); SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } return false; } static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl &RawMask) { APInt UndefElts; SmallVector EltBits; // Extract the raw target constant bits. // FIXME: We currently don't support UNDEF bits or mask entries. if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, EltBits, /* AllowWholeUndefs */ false, /* AllowPartialUndefs */ false)) return false; // Insert the extracted elements into the mask. for (APInt Elt : EltBits) RawMask.push_back(Elt.getZExtValue()); return true; } /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. /// Note: This ignores saturation, so inputs must be checked first. static void createPackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Unary) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); unsigned Offset = Unary ? 0 : NumElts; for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) Mask.push_back(Elt + (Lane * NumEltsPerLane)); for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); } } /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. /// Sets \p IsUnary to true if only one source is used. Note that this will set /// IsUnary for shuffles which use a single input multiple times, and in those /// cases it will adjust the mask to only have indices within that single input. /// It is an error to call this with non-empty Mask/Ops vectors. static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl &Ops, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); IsUnary = false; bool IsFakeUnary = false; switch(N->getOpcode()) { case X86ISD::BLENDI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::INSERTPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeINSERTPSMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::EXTRQI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); if (isa(N->getOperand(1)) && isa(N->getOperand(2))) { int BitLen = N->getConstantOperandVal(1); int BitIdx = N->getConstantOperandVal(2); DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask); IsUnary = true; } break; case X86ISD::INSERTQI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); if (isa(N->getOperand(2)) && isa(N->getOperand(3))) { int BitLen = N->getConstantOperandVal(2); int BitIdx = N->getConstantOperandVal(3); DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); } break; case X86ISD::UNPCKH: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKLMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(0)); break; case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSLLDQMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSRLDQMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFHWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFLWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeZeroMoveLowMask(VT, Mask); IsUnary = true; break; case X86ISD::VBROADCAST: { SDValue N0 = N->getOperand(0); // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, // add the pre-extracted value to the Ops vector. if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.getOperand(0).getValueType() == VT && N0.getConstantOperandVal(1) == 0) Ops.push_back(N0.getOperand(0)); // We only decode broadcasts of same-sized vectors, unless the broadcast // came from an extract from the original width. If we found one, we // pushed it the Ops vector above. if (N0.getValueType() == VT || !Ops.empty()) { DecodeVectorBroadcast(VT, Mask); IsUnary = true; break; } return false; } case X86ISD::VPERMILPV: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); unsigned MaskEltSize = VT.getScalarSizeInBits(); SmallVector RawMask; if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { DecodeVPERMILPMask(VT, RawMask, Mask); break; } if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPERMILPMask(C, MaskEltSize, Mask); break; } return false; } case X86ISD::PSHUFB: { assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); SmallVector RawMask; if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { DecodePSHUFBMask(RawMask, Mask); break; } if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodePSHUFBMask(C, Mask); break; } return false; } case X86ISD::VPERMI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSLDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSHDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVDDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVDDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVLPD: case X86ISD::MOVLPS: // Not yet implemented return false; case X86ISD::VPERMIL2: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); unsigned MaskEltSize = VT.getScalarSizeInBits(); SDValue MaskNode = N->getOperand(2); SDValue CtrlNode = N->getOperand(3); if (ConstantSDNode *CtrlOp = dyn_cast(CtrlNode)) { unsigned CtrlImm = CtrlOp->getZExtValue(); SmallVector RawMask; if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask); break; } if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask); break; } } return false; } case X86ISD::VPPERM: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); SDValue MaskNode = N->getOperand(2); SmallVector RawMask; if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { DecodeVPPERMMask(RawMask, Mask); break; } if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPPERMMask(C, Mask); break; } return false; } case X86ISD::VPERMV: { assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. Ops.push_back(N->getOperand(1)); SDValue MaskNode = N->getOperand(0); SmallVector RawMask; unsigned MaskEltSize = VT.getScalarSizeInBits(); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) { DecodeVPERMVMask(RawMask, Mask); break; } if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPERMVMask(C, MaskEltSize, Mask); break; } return false; } case X86ISD::VPERMV3: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. Ops.push_back(N->getOperand(0)); Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(1); unsigned MaskEltSize = VT.getScalarSizeInBits(); if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPERMV3Mask(C, MaskEltSize, Mask); break; } return false; } case X86ISD::VPERMIV3: { assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2); // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one. Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(0); unsigned MaskEltSize = VT.getScalarSizeInBits(); if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPERMV3Mask(C, MaskEltSize, Mask); break; } return false; } default: llvm_unreachable("unknown target shuffle node"); } // Empty mask indicates the decode failed. if (Mask.empty()) return false; // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero) if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); // If we didn't already add operands in the opcode-specific code, default to // adding 1 or 2 operands starting at 0. if (Ops.empty()) { Ops.push_back(N->getOperand(0)); if (!IsUnary || IsFakeUnary) Ops.push_back(N->getOperand(1)); } return true; } /// Check a target shuffle mask's inputs to see if we can set any values to /// SM_SentinelZero - this is for elements that are known to be zero /// (not just zeroable) from their inputs. /// Returns true if the target shuffle mask was decoded. static bool setTargetShuffleZeroElements(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; MVT VT = N.getSimpleValueType(); if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Mask.size()) == 0 && "Illegal split of shuffle value type"); unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); // Extract known constant input data. APInt UndefSrcElts[2]; SmallVector SrcEltBits[2]; bool IsSrcConstant[2] = { getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], true, false), getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false)}; for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. if (M < 0) continue; // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; SDValue V = M < Size ? V1 : V2; M %= Size; // We are referencing an UNDEF input. if (V.isUndef()) { Mask[i] = SM_SentinelUndef; continue; } // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. // TODO: We currently only set UNDEF for integer types - floats use the same // registers as vectors and many of the scalar folded loads rely on the // SCALAR_TO_VECTOR pattern. if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && (Size % V.getValueType().getVectorNumElements()) == 0) { int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) Mask[i] = SM_SentinelUndef; else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) Mask[i] = SM_SentinelZero; continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) Mask[i] = SM_SentinelUndef; else if (SrcEltBits[SrcIdx][M] == 0) Mask[i] = SM_SentinelZero; } } assert(VT.getVectorNumElements() == Mask.size() && "Different mask size from vector size!"); return true; } // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops, SelectionDAG &DAG) { Mask.clear(); Ops.clear(); MVT VT = N.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); unsigned NumSizeInBits = VT.getSizeInBits(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 && "Expected byte aligned value types"); unsigned Opcode = N.getOpcode(); switch (Opcode) { case ISD::AND: case X86ISD::ANDNP: { // Attempt to decode as a per-byte mask. APInt UndefElts; SmallVector EltBits; SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); bool IsAndN = (X86ISD::ANDNP == Opcode); uint64_t ZeroMask = IsAndN ? 255 : 0; if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { if (UndefElts[i]) { Mask.push_back(SM_SentinelUndef); continue; } uint64_t ByteBits = EltBits[i].getZExtValue(); if (ByteBits != 0 && ByteBits != 255) return false; Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); } Ops.push_back(IsAndN ? N1 : N0); return true; } case ISD::SCALAR_TO_VECTOR: { // Match against a scalar_to_vector of an extract from a vector, // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. SDValue N0 = N.getOperand(0); SDValue SrcExtract; if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && N0.getOperand(0).getValueType() == VT) || (N0.getOpcode() == X86ISD::PEXTRW && N0.getOperand(0).getValueType() == MVT::v8i16) || (N0.getOpcode() == X86ISD::PEXTRB && N0.getOperand(0).getValueType() == MVT::v16i8)) { SrcExtract = N0; } if (!SrcExtract || !isa(SrcExtract.getOperand(1))) return false; SDValue SrcVec = SrcExtract.getOperand(0); EVT SrcVT = SrcVec.getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); if (NumSrcElts <= SrcIdx) return false; Ops.push_back(SrcVec); Mask.push_back(SrcIdx); Mask.append(NumZeros, SM_SentinelZero); Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); return true; } case X86ISD::PINSRB: case X86ISD::PINSRW: { SDValue InVec = N.getOperand(0); SDValue InScl = N.getOperand(1); uint64_t InIdx = N.getConstantOperandVal(2); assert(InIdx < NumElts && "Illegal insertion index"); // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. if (X86::isZeroNode(InScl)) { Ops.push_back(InVec); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); return true; } // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern. // TODO: Expand this to support INSERT_VECTOR_ELT/etc. unsigned ExOp = (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); if (InScl.getOpcode() != ExOp) return false; SDValue ExVec = InScl.getOperand(0); uint64_t ExIdx = InScl.getConstantOperandVal(1); assert(ExIdx < NumElts && "Illegal extraction index"); Ops.push_back(InVec); Ops.push_back(ExVec); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == InIdx ? NumElts + ExIdx : i); return true; } case X86ISD::PACKSS: case X86ISD::PACKUS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && "Unexpected input value type"); // If we know input saturation won't happen we can treat this // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) || (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) || (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask))) return false; } bool IsUnary = (N0 == N1); Ops.push_back(N0); if (!IsUnary) Ops.push_back(N1); createPackShuffleMask(VT, Mask, IsUnary); return true; } case X86ISD::VSHLI: case X86ISD::VSRLI: { uint64_t ShiftVal = N.getConstantOperandVal(1); // Out of range bit shifts are guaranteed to be zero. if (NumBitsPerElt <= ShiftVal) { Mask.append(NumElts, SM_SentinelZero); return true; } // We can only decode 'whole byte' bit shifts as shuffles. if ((ShiftVal % 8) != 0) break; uint64_t ByteShift = ShiftVal / 8; unsigned NumBytes = NumSizeInBits / 8; unsigned NumBytesPerElt = NumBitsPerElt / 8; Ops.push_back(N.getOperand(0)); // Clear mask to all zeros and insert the shifted byte indices. Mask.append(NumBytes, SM_SentinelZero); if (X86ISD::VSHLI == Opcode) { for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j] = i + j - ByteShift; } else { for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j - ByteShift] = i + j; } return true; } case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VZEXT: { // TODO - add support for VPMOVZX with smaller input vector types. SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (NumSizeInBits != SrcVT.getSizeInBits()) break; DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask); Ops.push_back(Src); return true; } } return false; } /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, SmallVectorImpl &Mask) { int MaskWidth = Mask.size(); SmallVector UsedInputs; for (int i = 0, e = Inputs.size(); i < e; ++i) { int lo = UsedInputs.size() * MaskWidth; int hi = lo + MaskWidth; // Strip UNDEF input usage. if (Inputs[i].isUndef()) for (int &M : Mask) if ((lo <= M) && (M < hi)) M = SM_SentinelUndef; // Check for unused inputs. if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { UsedInputs.push_back(Inputs[i]); continue; } for (int &M : Mask) if (lo <= M) M -= MaskWidth; } Inputs = UsedInputs; } /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the /// remaining input indices in case we now have a unary shuffle and adjust the /// inputs accordingly. /// Returns true if the target shuffle mask was decoded. static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, SelectionDAG &DAG) { if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) if (!getFauxShuffleMask(Op, Mask, Inputs, DAG)) return false; resolveTargetShuffleInputsAndMask(Inputs, Mask); return true; } /// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast(N)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; SmallVector ShuffleOps; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt == SM_SentinelZero) return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufSVT); assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); return SDValue(); } // Use PINSRB/PINSRW/PINSRD to create a build vector. static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && "Illegal vector insertion"); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < NumElts; ++i) { bool IsNonZero = (NonZeros & (1 << i)) != 0; if (!IsNonZero) continue; // If the build vector contains zeros or our first insertion is not the // first index then insert into zero vector to break any register // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. if (First) { First = false; if (NumZero || 0 != i) V = getZeroVector(VT, Subtarget, DAG, dl); else { assert(0 == i && "Expected insertion into zero-index"); V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); V = DAG.getBitcast(VT, V); continue; } } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return V; } /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 8 && !Subtarget.hasSSE41()) return SDValue(); // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget.hasSSE41()) return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget); SDLoc dl(Op); SDValue V; bool First = true; // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } if ((i & 1) != 0) { // FIXME: Investigate extending to i32 instead of just i16. // FIXME: Investigate combining the first 4 bytes as a i32 instead. SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, DAG.getConstant(8, dl, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; if (ThisElt) { if (1 == i) { V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); V = DAG.getBitcast(MVT::v8i16, V); } else { V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, DAG.getIntPtrConstant(i / 2, dl)); } } } } return DAG.getBitcast(MVT::v16i8, V); } /// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 4 && !Subtarget.hasSSE41()) return SDValue(); // Use PINSRW to insert each byte directly. return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget); } /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Find all zeroable elements. std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { SDValue Elt = Op->getOperand(i); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i=0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op->getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = Elt.getConstantOperandVal(1); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); } // See if we can lower this build_vector to a INSERTPS. if (!Subtarget.hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i); } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getBitcast(MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getBitcast(MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask, DL)); return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v16i8; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || LD->isVolatile()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { FI = cast(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI.setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StartOffset, DL, Ptr.getValueType())); } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset)); SmallVector Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); } return SDValue(); } /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. /// /// Example: -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { unsigned NumElems = Elts.size(); int LastLoadedElt = -1; SmallBitVector LoadMask(NumElems, false); SmallBitVector ZeroMask(NumElems, false); SmallBitVector UndefMask(NumElems, false); // For each element in the initializer, see if we've found a load, zero or an // undef. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = peekThroughBitcasts(Elts[i]); if (!Elt.getNode()) return SDValue(); if (Elt.isUndef()) UndefMask[i] = true; else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) ZeroMask[i] = true; else if (ISD::isNON_EXTLoad(Elt.getNode())) { LoadMask[i] = true; LastLoadedElt = i; // Each loaded element must be the correct fractional portion of the // requested vector load. if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) return SDValue(); } else return SDValue(); } assert((ZeroMask | UndefMask | LoadMask).count() == NumElems && "Incomplete element masks"); // Handle Special Cases - all undef or undef/zero. if (UndefMask.count() == NumElems) return DAG.getUNDEF(VT); // FIXME: Should we return this as a BUILD_VECTOR instead? if ((ZeroMask | UndefMask).count() == NumElems) return VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); int FirstLoadedElt = LoadMask.find_first(); SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); LoadSDNode *LDBase = cast(EltBase); EVT LDBaseVT = EltBase.getValueType(); // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. bool IsConsecutiveLoad = true; bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { SDValue Elt = peekThroughBitcasts(Elts[i]); LoadSDNode *LD = cast(Elt); if (!DAG.areNonVolatileConsecutiveLoads( LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8, i - FirstLoadedElt)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; } } else if (ZeroMask[i]) { IsConsecutiveLoad = false; } } SmallVector Loads; for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i) if (LoadMask[i]) Loads.push_back(cast(peekThroughBitcasts(Elts[i]))); auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(!(MMOFlags & MachineMemOperand::MOVolatile) && "Cannot merge volatile loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); for (auto *LD : Loads) DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; // LOAD - all consecutive load/undefs (must start/end with a load). // If we have found an entire vector of loads and undefs, then return a large // load of the entire vector width starting at the base pointer. // If the vector contains zeros, then attempt to shuffle those elements. if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { assert(LDBase && "Did not find base load for merging consecutive loads"); EVT EltVT = LDBase->getValueType(0); // Ensure that the input vector size for the merged loads matches the // cumulative size of the input elements. if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) return SDValue(); if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); // Don't create 256-bit non-temporal aligned loads without AVX2 as these // will lower to regular temporal loads and use the cache. if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 && VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); if (IsConsecutiveLoad) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) { SmallVector ClearMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { if (ZeroMask[i]) ClearMask[i] = i + NumElems; else if (LoadMask[i]) ClearMask[i] = i; } SDValue V = CreateLoad(VT, LDBase); SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } } int LoadSize = (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && (LoadSize == 32 || LoadSize == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize) : MVT::getIntegerVT(LoadSize); MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize); if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), LDBase->getAlignment(), MachineMemOperand::MOLoad); for (auto *LD : Loads) DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } return SDValue(); } static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector ConstantVec; for (unsigned i = 0; i < NumElm; i++) { APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { if (ScalarSize == 32) { Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); } else { assert(ScalarSize == 64 && "Unsupported floating point scalar size"); Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); } } else Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); ConstantVec.push_back(Const); } return ConstantVector::get(ArrayRef(ConstantVec)); } static bool isUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { if (isTargetShuffle(U->getOpcode())) return true; if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts return isUseOfShuffle(U); } return false; } // Check if the current node of build vector is a zero extended vector. // // If so, return the value extended. // // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a. // // NumElt - return the number of zero extended identical values. // // EltType - return the type of the value include the zero extend. static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op, unsigned &NumElt, MVT &EltType) { SDValue ExtValue = Op->getOperand(0); unsigned NumElts = Op->getNumOperands(); unsigned Delta = NumElts; for (unsigned i = 1; i < NumElts; i++) { if (Op->getOperand(i) == ExtValue) { Delta = i; break; } if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i)))) return SDValue(); } if (!isPowerOf2_32(Delta) || Delta == 1) return SDValue(); for (unsigned i = Delta; i < NumElts; i++) { if (i % Delta == 0) { if (Op->getOperand(i) != ExtValue) return SDValue(); } else if (!(isNullConstant(Op->getOperand(i)) || Op->getOperand(i).isUndef())) return SDValue(); } unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits(); unsigned ExtVTSize = EltSize * Delta; EltType = MVT::getIntegerVT(ExtVTSize); NumElt = NumElts / Delta; return ExtValue; } /// Attempt to use the vbroadcast instruction to generate a splat value /// from a splat BUILD_VECTOR which uses: /// a. A single scalar load, or a constant. /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). /// /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. if (!Subtarget.hasAVX()) return SDValue(); MVT VT = BVOp->getSimpleValueType(0); SDLoc dl(BVOp); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); BitVector UndefElements; SDValue Ld = BVOp->getSplatValue(&UndefElements); // Attempt to use VBROADCASTM // From this paterrn: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) // b. t1 = (build_vector t0 t0) // // Create (VBROADCASTM v2i1 X) if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) { MVT EltType = VT.getScalarType(); unsigned NumElts = VT.getVectorNumElements(); SDValue BOperand; SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType); if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) || (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND && Ld.getOperand(0).getOpcode() == ISD::BITCAST)) { if (ZeroExtended) BOperand = ZeroExtended.getOperand(0); else BOperand = Ld.getOperand(0).getOperand(0); if (BOperand.getValueType().isVector() && BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) { if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 || NumElts == 8)) || // for broadcastmb2q (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 || NumElts == 16))) { // for broadcastmw2d SDValue Brdcst = DAG.getNode(X86ISD::VBROADCASTM, dl, MVT::getVectorVT(EltType, NumElts), BOperand); return DAG.getBitcast(VT, Brdcst); } } } } // We need a splat of a single value to use broadcast, and it doesn't // make any sense if the value is only in one element of the vector. if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) { APInt SplatValue, Undef; unsigned SplatBitSize; bool HasUndef; // Check if this is a repeated constant pattern suitable for broadcasting. if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) && SplatBitSize > VT.getScalarSizeInBits() && SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); if (Subtarget.hasAVX()) { if (SplatBitSize <= 64 && Subtarget.hasAVX2() && !(SplatBitSize == 64 && Subtarget.is32Bit())) { // Splatted value can fit in one INTEGER constant in constant pool. // Load the constant and broadcast it. MVT CVT = MVT::getIntegerVT(SplatBitSize); Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize); Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, MVT::getVectorVT(CVT, Repeat), Ld); return DAG.getBitcast(VT, Brdcst); } else if (SplatBitSize == 32 || SplatBitSize == 64) { // Splatted value can fit in one FLOAT constant in constant pool. // Load the constant and broadcast it. // AVX have support for 32 and 64 bit broadcast for floats only. // No 64bit integer in 32bit subtarget. MVT CVT = MVT::getFloatingPointVT(SplatBitSize); // Lower the splat via APFloat directly, to avoid any conversion. Constant *C = SplatBitSize == 32 ? ConstantFP::get(*Ctx, APFloat(APFloat::IEEEsingle(), SplatValue)) : ConstantFP::get(*Ctx, APFloat(APFloat::IEEEdouble(), SplatValue)); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, MVT::getVectorVT(CVT, Repeat), Ld); return DAG.getBitcast(VT, Brdcst); } else if (SplatBitSize > 64) { // Load the vector of constants and broadcast it. MVT CVT = VT.getScalarType(); Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); unsigned Alignment = cast(VCP)->getAlignment(); Ld = DAG.getLoad( MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld); return DAG.getBitcast(VT, Brdcst); } } } return SDValue(); } bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget.hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. return SDValue(); } /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = cast(ExtIdx)->getZExtValue(); if (!isa(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector InsertIndices; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); return NV; } static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); uint64_t Immediate = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (!In.isUndef()) Immediate |= (cast(In)->getZExtValue() & 0x1) << idx; } SDLoc dl(Op); MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); return DAG.getConstant(Immediate, dl, VT); } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. SDValue X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) return Op; if (ISD::isBuildVectorAllOnes(Op.getNode())) return Op; if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { // Split the pieces. SDValue Lower = DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32)); SDValue Upper = DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32)); // We have to manually lower both halves so getNode doesn't try to // reassemble the build_vector. Lower = LowerBUILD_VECTORvXi1(Lower, DAG); Upper = LowerBUILD_VECTORvXi1(Upper, DAG); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper); } SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } // Vector has one or more non-const elements uint64_t Immediate = 0; SmallVector NonConstIdx; bool IsSplat = true; bool HasConstElts = false; int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.isUndef()) continue; if (!isa(In)) NonConstIdx.push_back(idx); else { Immediate |= (cast(In)->getZExtValue() & 0x1) << idx; HasConstElts = true; } if (SplatIdx < 0) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; SDValue Imm; if (Immediate) { MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); Imm = DAG.getConstant(Immediate, dl, ImmVT); } else if (HasConstElts) Imm = DAG.getConstant(0, dl, VT); else Imm = DAG.getUNDEF(VT); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) DstVec = DAG.getBitcast(VT, Imm); else { SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { unsigned InsertIdx = NonConstIdx[i]; DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getIntPtrConstant(InsertIdx, dl)); } return DstVec; } /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. /// /// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// horizontal operation. Parameter \p Opcode defines the kind of horizontal /// operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->isUndef()) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa(Op0.getOperand(1)) && isa(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { if (V0.isUndef()) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { if (V1.isUndef()) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { MVT VT = V0.getSimpleValueType(); assert(VT.is256BitVector() && VT == V1.getSimpleValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL); SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL); SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL); SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL); MVT NewVT = V0_LO.getSimpleValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && !V0->isUndef()) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); if (!isUndefHI && !V1->isUndef()) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef())) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef())) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } /// Returns true iff \p BV builds a vector with the result equivalent to /// the result of ADDSUB operation. /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation /// are written to the parameters \p Opnd0 and \p Opnd1. static bool isAddSub(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts) { MVT VT = BV->getSimpleValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) return false; unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); NumExtracts = 0; // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting two integer/float elements. unsigned ExpectedOpcode = ISD::FSUB; unsigned NextExpectedOpcode = ISD::FADD; bool AddFound = false; bool SubFound = false; for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) { std::swap(ExpectedOpcode, NextExpectedOpcode); continue; } // Early exit if we found an unexpected opcode. if (Opcode != ExpectedOpcode) return false; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return false; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); if (I0 != i) return false; // We found a valid add/sub node. Update the information accordingly. if (i & 1) AddFound = true; else SubFound = true; // Update InVec0 and InVec1. if (InVec0.isUndef()) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) return false; } if (InVec1.isUndef()) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) return false; } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (ExpectedOpcode == ISD::FSUB) return false; // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return false; } if (InVec1 != Op1.getOperand(0)) return false; // Update the pair of expected opcodes. std::swap(ExpectedOpcode, NextExpectedOpcode); // Increment the number of extractions done. ++NumExtracts; } // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef()) return false; Opnd0 = InVec0; Opnd1 = InVec1; return true; } /// Returns true if is possible to fold MUL and an idiom that has already been /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. /// /// Prior to calling this function it should be known that there is some /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called /// before replacement of such SDNode with ADDSUB operation. Thus the number /// of \p Opnd0 uses is expected to be equal to 2. /// For example, this function may be called for the following IR: /// %AB = fmul fast <2 x double> %A, %B /// %Sub = fsub fast <2 x double> %AB, %C /// %Add = fadd fast <2 x double> %AB, %C /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, /// <2 x i32> /// There is a def for %Addsub here, which potentially can be replaced by /// X86ISD::ADDSUB operation: /// %Addsub = X86ISD::ADDSUB %AB, %C /// and such ADDSUB can further be replaced with FMADDSUB: /// %Addsub = FMADDSUB %A, %B, %C. /// /// The main reason why this method is called before the replacement of the /// recognized ADDSUB idiom with ADDSUB operation is that such replacement /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses) { if (Opnd0.getOpcode() != ISD::FMUL || !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; // FIXME: These checks must match the similar ones in // DAGCombiner::visitFADDForFMACombine. It would be good to have one // function that would answer if it is Ok to fuse MUL + ADD to FMADD // or MUL + ADDSUB to FMADDSUB. const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); if (!AllowFusion) return false; Opnd2 = Opnd1; Opnd1 = Opnd0.getOperand(1); Opnd0 = Opnd0.getOperand(0); return true; } /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation /// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node. static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; unsigned NumExtracts; if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts)) return SDValue(); MVT VT = BV->getSimpleValueType(0); SDLoc DL(BV); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; // TODO: According to coverage reports, the FMADDSUB transform is not // triggered by any tests. if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom // recognition. if (VT.is512BitVector()) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = BV->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; unsigned Half = NumElts/2; // Count the number of UNDEF operands in the build_vector in input. for (unsigned i = 0, e = Half; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsHI++; // Early exit if this is either a build_vector of all UNDEFs or all the // operands but one are UNDEF. if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) return SDValue(); SDLoc DL(BV); SDValue InVec0, InVec1; if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) { // Try to match an SSE3 float HADD/HSUB. if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } if (!Subtarget.hasAVX()) return SDValue(); if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { // Try to match an AVX horizontal add/sub of packed single/double // precision floating point values from 256-bit vectors. SDValue InVec2, InVec3; if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { // Try to match an AVX2 horizontal add/sub of signed integers. SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; if (CanFold) { // Fold this build_vector into a single horizontal add/sub. // Do this only if the target has AVX2. if (Subtarget.hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into a pair of horizontal binop followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, isUndefLO, isUndefHI); } } if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) && Subtarget.hasAVX()) { unsigned X86Opcode; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); // Don't try to expand this build_vector into a pair of horizontal add/sub // if we can simply emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into two horizontal add/sub followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, isUndefLO, isUndefHI); } return SDValue(); } /// If a BUILD_VECTOR's source elements all apply the same bit operation and /// one of their operands is constant, lower to a pair of BUILD_VECTOR and /// just apply the bit to the vectors. /// NOTE: Its not in our interest to start make a general purpose vectorizer /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Check that all elements have the same opcode. // TODO: Should we allow UNDEFS and if so how many? unsigned Opcode = Op->getOperand(0).getOpcode(); for (unsigned i = 1; i < NumElems; ++i) if (Opcode != Op->getOperand(i).getOpcode()) return SDValue(); // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). switch (Opcode) { default: return SDValue(); case ISD::AND: case ISD::XOR: case ISD::OR: if (!TLI.isOperationLegalOrPromote(Opcode, VT)) return SDValue(); break; } SmallVector LHSElts, RHSElts; for (SDValue Elt : Op->ops()) { SDValue LHS = Elt.getOperand(0); SDValue RHS = Elt.getOperand(1); // We expect the canonicalized RHS operand to be the constant. if (!isa(RHS)) return SDValue(); LHSElts.push_back(LHS); RHSElts.push_back(RHS); } SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); return DAG.getNode(Opcode, DL, VT, LHS, RHS); } /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getZeroVector(VT, Subtarget, DAG, DL); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || VT == MVT::v16i32 || (VT == MVT::v8i32 && Subtarget.hasInt256())) return Op; return getOnesVector(VT, DAG, DL); } return SDValue(); } // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be // reasoned to be a permutation of a vector by indices in a non-constant vector. // (build_vector (extract_elt V, (extract_elt I, 0)), // (extract_elt V, (extract_elt I, 1)), // ... // -> // (vpermv I, V) // // TODO: Handle undefs // TODO: Utilize pshufb and zero mask blending to support more efficient // construction of vectors with constant-0 elements. // TODO: Use smaller-element vectors of same width, and "interpolate" the indices, // when no native operation available. static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Look for VPERMV and PSHUFB opportunities. MVT VT = V.getSimpleValueType(); switch (VT.SimpleTy) { default: return SDValue(); case MVT::v16i8: if (!Subtarget.hasSSE3()) return SDValue(); break; case MVT::v8f32: case MVT::v8i32: if (!Subtarget.hasAVX2()) return SDValue(); break; case MVT::v4i64: case MVT::v4f64: if (!Subtarget.hasVLX()) return SDValue(); break; case MVT::v16f32: case MVT::v8f64: case MVT::v16i32: case MVT::v8i64: if (!Subtarget.hasAVX512()) return SDValue(); break; case MVT::v32i16: if (!Subtarget.hasBWI()) return SDValue(); break; case MVT::v8i16: case MVT::v16i16: if (!Subtarget.hasVLX() || !Subtarget.hasBWI()) return SDValue(); break; case MVT::v64i8: if (!Subtarget.hasVBMI()) return SDValue(); break; case MVT::v32i8: if (!Subtarget.hasVLX() || !Subtarget.hasVBMI()) return SDValue(); break; } SDValue SrcVec, IndicesVec; // Check for a match of the permute source vector and permute index elements. // This is done by checking that the i-th build_vector operand is of the form: // (extract_elt SrcVec, (extract_elt IndicesVec, i)). for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { SDValue Op = V.getOperand(Idx); if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract encountered in V, set the source vector, // otherwise verify the extract is from the previously defined source // vector. if (!SrcVec) SrcVec = Op.getOperand(0); else if (SrcVec != Op.getOperand(0)) return SDValue(); SDValue ExtractedIndex = Op->getOperand(1); // Peek through extends. if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) ExtractedIndex = ExtractedIndex.getOperand(0); if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract from the index vector candidate, set the // indices vector, otherwise verify the extract is from the previously // defined indices vector. if (!IndicesVec) IndicesVec = ExtractedIndex.getOperand(0); else if (IndicesVec != ExtractedIndex.getOperand(0)) return SDValue(); auto *PermIdx = dyn_cast(ExtractedIndex.getOperand(1)); if (!PermIdx || PermIdx->getZExtValue() != Idx) return SDValue(); } MVT IndicesVT = VT; if (VT.isFloatingPoint()) IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT); return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG); if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) return VectorConstant; BuildVectorSDNode *BV = cast(Op.getNode()); // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB // transform here. if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) return BitOp; unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet Values; unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.isUndef()) continue; Values.insert(Elt); if (!isa(Elt) && !isa(Elt)) { IsAllConstants = false; NumConstants--; } if (X86::isZeroNode(Elt)) NumZero++; else { assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } // All undef vector. Return an UNDEF. All zero vectors were handled above. if (NumNonZero == 0) return DAG.getUNDEF(VT); // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not // supported, we assume that we will fall back to a shuffle to get the scalar // blended with the constants. Insertion into a zero vector is handled as a // special-case somewhere below here. LLVMContext &Context = *DAG.getContext(); if (NumConstants == NumElems - 1 && NumNonZero != 1 && (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { // Create an all-constant vector. The variable element in the old // build vector is replaced by undef in the constant vector. Save the // variable scalar element and its index for use in the insertelement. Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context); SmallVector ConstVecOps(NumElems, UndefValue::get(EltType)); SDValue VarElt; SDValue InsIndex; for (unsigned i = 0; i != NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue()); else if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF()); else if (!Elt.isUndef()) { assert(!VarElt.getNode() && !InsIndex.getNode() && "Expected one variable element in this vector"); VarElt = Elt; InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout())); } } Constant *CV = ConstantVector::get(ConstVecOps); SDValue DAGConstVec = DAG.getConstantPool(CV, VT); // The constants we just created may not be legal (eg, floating point). We // must lower the vector right here because we can not guarantee that we'll // legalize it before loading it. This is also why we could not just create // a new build vector here. If the build vector contains illegal constants, // it could get split back up into a series of insert elements. // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD. SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG); MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); } // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If this is an insertion of an i64 value on x86-32, and if the top bits of // the value are obviously zero, truncate the value to i32 and do the // insertion that way. Only do this if the value is non-constant or if the // value is a constant being inserted into element 0. It is cheaper to do // a constant pool load than it is to do a movd + shuffle. if (ExtVT == MVT::i64 && !Subtarget.is64Bit() && (!IsAllConstants || Idx == 0)) { if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); MVT VecVT = MVT::v4i32; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef( Item, Idx * 2, true, Subtarget, DAG)); } } // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget.is64Bit())) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } // We can't directly insert an i8 or i16 into a vector, so zero extend // it to i32 first. if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); if (VT.getSizeInBits() >= 256) { MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); if (Subtarget.hasAVX()) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } else { // Without AVX, we need to extend to a 128-bit vector and then // insert into the 256-bit vector. Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); } } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) return V; // See if we can use a vector load to get all of the elements. if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) { SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElems); if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; } // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. if (NumElems == 4 && NumZero > 0) { SmallVector Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1ULL << i)); if (isZero) Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros >> (i*2)) & 0x3) { default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. break; case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break; case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; } } bool Reverse1 = (NonZeros & 0x3) == 2; bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. if (Subtarget.hasSSE41()) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). SmallVector Ops(NumElems); for (unsigned i = 0; i < NumElems; ++i) { if (!Op.getOperand(i).isUndef()) Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else Ops[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 1 ==> X: // : unpcklps 2, 3 ==> Y: // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { // Generate scaled UNPCKL shuffle mask. SmallVector Mask; for(unsigned i = 0; i != Scale; ++i) Mask.push_back(i); for (unsigned i = 0; i != Scale; ++i) Mask.push_back(NumElems+i); Mask.append(NumElems - Mask.size(), SM_SentinelUndef); for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } return Ops[0]; } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); if (ResVT.is256BitVector()) return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return concat256BitVectors( concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl), concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT, NumElems, DAG, dl); } return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } // Return true if all the operands of the given CONCAT_VECTORS node are zeros // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0) static bool isExpandWithZeros(const SDValue &Op) { assert(Op.getOpcode() == ISD::CONCAT_VECTORS && "Expand with zeros only possible in CONCAT_VECTORS nodes!"); for (unsigned i = 1; i < Op.getNumOperands(); i++) if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) return false; return true; } // Returns true if the given node is a type promotion (by concatenating i1 // zeros) of the result of a node that already zeros all upper bits of // k-register. static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { unsigned Opc = Op.getOpcode(); assert(Opc == ISD::CONCAT_VECTORS && Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected node to check for type promotion!"); // As long as we are concatenating zeros to the upper part of a previous node // result, climb up the tree until a node with different opcode is // encountered while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { if (Opc == ISD::INSERT_SUBVECTOR) { if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && Op.getConstantOperandVal(2) == 0) Op = Op.getOperand(1); else return SDValue(); } else { // Opc == ISD::CONCAT_VECTORS if (isExpandWithZeros(Op)) Op = Op.getOperand(0); else return SDValue(); } Opc = Op.getOpcode(); } // Check if the first inserted node zeroes the upper bits, or an 'and' result // of a node that zeros the upper bits (its masked version). if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || (Op.getOpcode() == ISD::AND && (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { return Op; } return SDValue(); } static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); unsigned NumOperands = Op.getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); // If this node promotes - by concatenating zeroes - the type of the result // of a node with instruction that zeroes all upper (irrelevant) bits of the // output register, mark it as legal and catch the pattern in instruction // selection to avoid emitting extra instructions (for zeroing upper bits). if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) { SDValue ZeroC = DAG.getIntPtrConstant(0, dl); SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted, ZeroC); } unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. NonZeros |= (uint64_t)1 << i; ++NumNonZero; } } // If there are zero or one non-zeros we can handle this very simply. if (NumNonZero <= 1) { SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) : DAG.getUNDEF(ResVT); if (!NumNonZero) return Vec; unsigned Idx = countTrailingZeros(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, DAG.getIntPtrConstant(Idx * SubVecNumElts, dl)); } if (NumOperands > 2) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } assert(NumNonZero == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); unsigned NumElems = ResVT.getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// \brief Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != i) return false; } return true; } /// \brief Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { int LaneSize = 128 / VT.getScalarSizeInBits(); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// \brief Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same /// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// suitable for use with existing 128-bit shuffles as entries from the second /// vector have been remapped to [LaneSize, 2*LaneSize). static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0); if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a shuffle mask is equivalent within each 128-bit lane. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } /// Test whether a shuffle mask is equivalent within each 256-bit lane. static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); } /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, SM_SentinelUndef); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0)); if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] == SM_SentinelZero) { if (!isUndefOrZero(RepeatedMask[i % LaneSize])) return false; RepeatedMask[i % LaneSize] = SM_SentinelZero; continue; } if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] == SM_SentinelUndef) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef Mask, ArrayRef ExpectedMask) { if (Mask.size() != ExpectedMask.size()) return false; int Size = Mask.size(); // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. auto *BV1 = dyn_cast(V1); auto *BV2 = dyn_cast(V2); for (int i = 0; i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) { auto *MaskBV = Mask[i] < Size ? BV1 : BV2; auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; if (!MaskBV || !ExpectedBV || MaskBV->getOperand(Mask[i] % Size) != ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } } return true; } /// Checks whether a target shuffle mask is equivalent to an explicit pattern. /// /// The masks must be exactly the same width. /// /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// /// SM_SentinelZero is accepted as a valid negative index but must match in both. static bool isTargetShuffleEquivalent(ArrayRef Mask, ArrayRef ExpectedMask) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; for (int i = 0; i < Size; ++i) if (Mask[i] == SM_SentinelUndef) continue; else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) return false; else if (Mask[i] != ExpectedMask[i]) return false; return true; } // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle // mask. static SmallVector createTargetShuffleMask(ArrayRef Mask, const APInt &Zeroable) { int NumElts = Mask.size(); assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); SmallVector TargetMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); } return TargetMask; } // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT) { if (VT != MVT::v8i32 && VT != MVT::v8f32) return false; SmallVector Unpcklwd; createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, /* Unary = */ false); SmallVector Unpckhwd; createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, /* Unary = */ false); bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || isTargetShuffleEquivalent(Mask, Unpckhwd)); return IsUnpackwdMask; } /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static unsigned getV4X86ShuffleImm(ArrayRef Mask) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; return Imm; } static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } /// \brief Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. /// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. static APInt computeZeroableShuffleElements(ArrayRef Mask, SDValue V1, SDValue V2) { APInt Zeroable(Mask.size(), 0); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); int VectorSizeInBits = V1.getValueSizeInBits(); int ScalarSizeInBits = VectorSizeInBits / Mask.size(); assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { Zeroable.setBit(i); continue; } // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; M %= Size; // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. if (V.getOpcode() != ISD::BUILD_VECTOR) continue; // If the BUILD_VECTOR has fewer elements then the bitcasted portion of // the (larger) source element must be UNDEF/ZERO. if ((Size % V.getNumOperands()) == 0) { int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef() || X86::isZeroNode(Op)) Zeroable.setBit(i); else if (ConstantSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getAPIntValue(); Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); if (Val == 0) Zeroable.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); if (Val == 0) Zeroable.setBit(i); } continue; } // If the BUILD_VECTOR has more elements then all the (smaller) source // elements must be UNDEF or ZERO. if ((V.getNumOperands() % Size) == 0) { int Scale = V->getNumOperands() / Size; bool AllZeroable = true; for (int j = 0; j < Scale; ++j) { SDValue Op = V.getOperand((M * Scale) + j); AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); } if (AllZeroable) Zeroable.setBit(i); continue; } } return Zeroable; } // The Shuffle result is as follow: // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. // Each Zeroable's element correspond to a particular Mask's element. // As described in computeZeroableShuffleElements function. // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element if (NextElement < 0) { NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } // Exit if the mask's non zero elements are not in increasing order. if (NextElement != Mask[i]) return false; NextElement++; } return true; } /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; const int NumEltBytes = VT.getScalarSizeInBits() / 8; assert((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())); SmallVector PSHUFBMask(NumBytes); // Sign bit set in i8 mask means zero element. SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); SDValue V; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / NumEltBytes]; if (M < 0) { PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); continue; } if (Zeroable[i / NumEltBytes]) { PSHUFBMask[i] = ZeroMask; continue; } // We can only use a single input of V1 or V2. SDValue SrcV = (M >= Size ? V2 : V1); if (V && V != SrcV) return SDValue(); V = SrcV; M %= Size; // PSHUFB can't cross lanes, ensure this doesn't happen. if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) return SDValue(); M = M % LaneSize; M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); unsigned NumElts = VT.getVectorNumElements(); assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && "Unexpected number of vector elements"); SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; return DAG.getSelect(DL, VT, VMask, DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), ZeroVector); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef TargetMask, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; for (int i = 0; i != NumElts; i += 2) { int M1 = TargetMask[i + 0]; int M2 = TargetMask[i + 1]; Undef1 &= (SM_SentinelUndef == M1); Undef2 &= (SM_SentinelUndef == M2); Zero1 &= isUndefOrZero(M1); Zero2 &= isUndefOrZero(M2); } assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && "Zeroable shuffle detected"); // Attempt to match the target mask against the unpack lo/hi mask patterns. SmallVector Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } // If an unary shuffle, attempt to match as an unpack lo/hi with zero. if (IsUnary && (Zero1 || Zero2)) { // Don't bother if we can blend instead. if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) return false; bool MatchLo = true, MatchHi = true; for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { int M = TargetMask[i]; // Ignore if the input is known to be zero or the index is undef. if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || (M == SM_SentinelUndef)) continue; MatchLo &= (M == Unpckl[i]); MatchHi &= (M == Unpckh[i]); } if (MatchLo || MatchHi) { UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; return true; } } // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; } } return false; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SmallVector Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); SmallVector Unpckh; createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef TargetMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); MVT PackSVT = MVT::getIntegerVT(BitSize * 2); MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); auto MatchPACK = [&](SDValue N1, SDValue N2) { SDValue VV1 = DAG.getBitcast(PackVT, N1); SDValue VV2 = DAG.getBitcast(PackVT, N2); if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKSS; return true; } if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKUS; return true; } } return false; }; // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false); if (isTargetShuffleEquivalent(TargetMask, BinaryMask)) if (MatchPACK(V1, V2)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true); if (isTargetShuffleEquivalent(TargetMask, UnaryMask)) if (MatchPACK(V1, V1)) return true; return false; } static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, Subtarget)) return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), DAG.getBitcast(PackVT, V2)); return SDValue(); } /// \brief Try to emit a bitmask instruction for a shuffle. /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "Floating point types are not supported"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); return DAG.getNode(ISD::AND, DL, VT, V, VMask); } /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); } SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); // We have to cast V2 around. MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, DAG.getBitcast(MaskVT, V1Mask), DAG.getBitcast(MaskVT, V2))); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG); static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, MutableArrayRef TargetMask, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { int M = TargetMask[i]; if (M == SM_SentinelUndef) continue; if (M == i) continue; if (M == i + Size) { BlendMask |= 1ull << i; continue; } if (M == SM_SentinelZero) { if (V1IsZeroOrUndef) { ForceV1Zero = true; TargetMask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; TargetMask[i] = i + Size; continue; } } return false; } return true; } static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { uint64_t ScaledMask = 0; for (int i = 0; i != Size; ++i) if (BlendMask & (1ull << i)) ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); return ScaledMask; } /// \brief Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector Mask = createTargetShuffleMask(Original, Zeroable); uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. if (ForceV1Zero) V1 = getZeroVector(VT, Subtarget, DAG, DL); if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: case MVT::v4f64: case MVT::v8f32: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); LLVM_FALLTHROUGH; case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into // that instruction. if (Subtarget.hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } LLVM_FALLTHROUGH; case MVT::v8i16: { // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } case MVT::v16i16: { assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } LLVM_FALLTHROUGH; } case MVT::v16i8: case MVT::v32i8: { assert((VT.is128BitVector() || Subtarget.hasAVX2()) && "256-bit byte-blends require AVX2 support!"); if (Subtarget.hasBWI() && Subtarget.hasVLX()) { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) return Masked; // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; // This form of blend is always done on bytes. Compute the byte vector // type. MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code // generator that boolean values in the elements of an x86 vector register // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' // mapping a select to operand #1, and 'false' mapping to operand #2. The // reality in x86 is that vector masks (pre-AVX-512) use only the high bit // of the element (the remaining are ignored) and 0 in that high bit would // mean operand #1 while 1 in the high bit would mean operand #2. So while // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. SmallVector VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8)); V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); } case MVT::v16f32: case MVT::v8f64: case MVT::v8i64: case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } default: llvm_unreachable("Not a supported integer vector type!"); } } /// \brief Try to lower as a blend of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector BlendMask(Mask.size(), -1); SmallVector PermuteMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); if (BlendMask[Mask[i] % Size] < 0) BlendMask[Mask[i] % Size] = Mask[i]; else if (BlendMask[Mask[i] % Size] != Mask[i]) return SDValue(); // Can't blend in the needed input! PermuteMask[i] = Mask[i] % Size; } SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } /// \brief Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector V1Mask(Mask.size(), -1); SmallVector V2Mask(Mask.size(), -1); SmallVector BlendMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && Mask[i] < Size) { V1Mask[i] = Mask[i]; BlendMask[i] = i; } else if (Mask[i] >= Size) { V2Mask[i] = Mask[i] - Size; BlendMask[i] = i + Size; } // Try to lower with the simpler initial blend strategy unless one of the // input shuffles would be a no-op. We prefer to shuffle inputs as the // shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, blending // first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } /// \brief Try to lower a vector shuffle as a rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && "Unexpected mask index."); if (M < 0) continue; // Determine where a rotated vector would have started. int StartIdx = i - (M % NumElts); if (StartIdx == 0) // The identity rotation isn't interesting, stop. return -1; // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return -1; // Compute which value this mask is pointing at. SDValue MaskV = M < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return -1; } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; V1 = Lo; V2 = Hi; return Rotation; } /// \brief Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. /// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask) { // Don't accept any shuffles with zero elements. if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return -1; // PALIGNR works on 128-bit lanes. SmallVector RepeatedMask; if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; // PALIGNR rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int NumElts = RepeatedMask.size(); int Scale = 16 / NumElts; return Rotation * Scale; } static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); // Cast the inputs to i8 vector of correct length to match PALIGNR or // PSLLDQ/PSRLDQ. MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); Lo = DAG.getBitcast(ByteVT, Lo); Hi = DAG.getBitcast(ByteVT, Hi); // SSSE3 targets can use the palignr instruction. if (Subtarget.hasSSSE3()) { assert((!VT.is512BitVector() || Subtarget.hasBWI()) && "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, DAG.getConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, DAG.getConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } /// \brief Try to lower a vector shuffle as a dword/qword rotation. /// /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary /// rotation of the concatenation of two vectors; This routine will /// try to generically lower a vector shuffle through such an pattern. /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); // 128/256-bit vectors are only supported with VLX. assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, DAG.getConstant(Rotation, DL, MVT::i8)); } /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) return -1; } int ShiftEltBits = ScalarSizeInBits * Scale; bool ByteShift = ShiftEltBits > 64; Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); return (int)ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) { int ShiftAmt = MatchShift(Shift, Scale, Left); if (0 < ShiftAmt) return ShiftAmt; } // no match return -1; } static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); MVT ShiftVT; SDValue V = V1; unsigned Opcode; // Try to match shuffle against V1 shift. int ShiftAmt = matchVectorShuffleAsShift( ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. if (ShiftAmt < 0) { ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, Size, Zeroable, Subtarget); V = V2; } if (ShiftAmt < 0) return SDValue(); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) return false; // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. if (i > M || M >= HalfSize) return false; if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return false; } if (!Src || Idx < 0) return false; assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Src; return true; } // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Base; V2 = Insert; return true; } } return false; } /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); return SDValue(); } /// \brief Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and /// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int EltBits = VT.getScalarSizeInBits(); int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = 128 / EltBits; int OffsetLane = Offset / NumEltsPerLane; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); assert(0 <= Offset && "Extension offset must be positive."); assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && "Extension offset must be in the first lane or start an upper lane."); // Check that an index is in same lane as the base offset. auto SafeOffset = [&](int Idx) { return OffsetLane == (Idx / NumEltsPerLane); }; // Shift along an input so that the offset base moves to the first element. auto ShuffleOffset = [&](SDValue V) { if (!Offset) return V; SmallVector ShMask((unsigned)NumElements, -1); for (int i = 0; i * Scale < NumElements; ++i) { int SrcIdx = i + Offset; ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; } return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {Offset / 2, -1, SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); assert(VT.is128BitVector() && "Unexpected vector width!"); int LoIdx = Offset * EltBits; SDValue Lo = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getBitcast(VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); PSHUFBMask[i] = DAG.getConstant( (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask))); } // If we are extending from an offset, ensure we start on a boundary that // we can unpack from. int AlignToUnpack = Offset % (NumElements / Scale); if (AlignToUnpack) { SmallVector ShMask((unsigned)NumElements, -1); for (int i = AlignToUnpack; i < NumElements; ++i) ShMask[i - AlignToUnpack] = i; InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); Offset -= AlignToUnpack; } // Otherwise emit a sequence of unpacks. do { unsigned UnpackLoHi = X86ISD::UNPCKL; if (Offset >= (NumElements / 2)) { UnpackLoHi = X86ISD::UNPCKH; Offset -= (NumElements / 2); } MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); return DAG.getBitcast(VT, InputV); } /// \brief Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't /// check for the profitability of this lowering, it tries to aggressively /// match this pattern. It will use all of the micro-architectural details it /// can to emit an efficient lowering. It handles both blends with all-zero /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to /// masking out later). /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; int Offset = 0; int Matches = 0; for (int i = 0; i < NumElements; ++i) { int M = Mask[i]; if (M < 0) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); // We no longer are in the anyext case. AnyExt = false; continue; } // Each of the base elements needs to be consecutive indices into the // same input vector. SDValue V = M < NumElements ? V1 : V2; M = M % NumElements; if (!InputV) { InputV = V; Offset = M - (i / Scale); } else if (InputV != V) return SDValue(); // Flip-flopping inputs. // Offset must start in the lowest 128-bit lane or at the start of an // upper lane. // FIXME: Is it ever worth allowing a negative base offset? if (!((0 <= Offset && Offset < NumEltsPerLane) || (Offset % NumEltsPerLane) == 0)) return SDValue(); // If we are offsetting, all referenced entries must come from the same // lane. if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) return SDValue(); if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. Matches++; } // If we fail to find an input, we have a zero-shuffle which should always // have already been handled. // FIXME: Maybe handle this here in case during blending we end up with one? if (!InputV) return SDValue(); // If we are offsetting, don't extend if we only match a single input, we // can always do better by using a basic PSHUF or PUNPCK. if (Offset != 0 && Matches < 2) return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. assert(Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"); int NumExtElements = Bits / 64; // Each iteration, try extending the elements half as much, but into twice as // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } // General extends failed, but 128-bit vectors may be able to use MOVQ. if (Bits != 128) return SDValue(); // Returns one of the source operands if the shuffle can be reduced to a // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. auto CanZExtLowHalf = [&]() { for (int i = NumElements / 2; i != NumElements; ++i) if (!Zeroable[i]) return SDValue(); if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) return V1; if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) return V2; return SDValue(); }; if (SDValue V = CanZExtLowHalf()) { V = DAG.getBitcast(MVT::v2i64, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); return DAG.getBitcast(VT, V); } // No viable ext lowering found. return SDValue(); } /// \brief Try to get a scalar value for a specific element of a vector. /// /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); V = peekThroughBitcasts(V); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getBitcast(EltVT, S); } return SDValue(); } /// \brief Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { V = peekThroughBitcasts(V); return ISD::isNON_EXTLoad(V.getNode()); } /// \brief Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32); V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply, and // the V1 elements can't be permuted in any way. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); SmallVector V1Mask(Mask.begin(), Mask.end()); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); if (!VT.is128BitVector()) return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"); return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v16i8, V2, DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, DAG.getTargetLoweringInfo().getScalarShiftAmountTy( DAG.getDataLayout(), VT))); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); EVT EltVT = VT.getVectorElementType(); EVT V0VT = V0.getValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); EVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. // Hopefully, we can fold away the trunc/srl/load into the broadcast. // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. if (const int OffsetIdx = BroadcastIdx % Scale) Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } /// \brief Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || (Subtarget.hasAVX() && VT.isFloatingPoint()) || (Subtarget.hasAVX2() && VT.isInteger()))) return SDValue(); // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumElts = Mask.size(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); // Check that the mask is a broadcast. int BroadcastIdx = -1; for (int i = 0; i != (int)NumElts; ++i) { SmallVector BroadcastMask(NumElts, i); if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { BroadcastIdx = i; break; } } if (BroadcastIdx < 0) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. SDValue V = V1; for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { // Peek through bitcasts as long as BroadcastIdx can be adjusted. SDValue VSrc = V.getOperand(0); unsigned NumEltBits = V.getScalarValueSizeInBits(); unsigned NumSrcBits = VSrc.getScalarValueSizeInBits(); if ((NumEltBits % NumSrcBits) == 0) BroadcastIdx *= (NumEltBits / NumSrcBits); else if ((NumSrcBits % NumEltBits) == 0 && (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0) BroadcastIdx /= (NumSrcBits / NumEltBits); else break; V = VSrc; continue; } case ISD::CONCAT_VECTORS: { int OperandSize = Mask.size() / V.getNumOperands(); V = V.getOperand(BroadcastIdx / OperandSize); BroadcastIdx %= OperandSize; continue; } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast(V.getOperand(2)); if (!ConstantIdx) break; int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; } else { V = VOuter; } continue; } } break; } // Ensure the source vector and BroadcastIdx are for a suitable type. if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) { unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned NumSrcBits = V.getScalarValueSizeInBits(); if ((NumSrcBits % NumEltBits) == 0) BroadcastIdx *= (NumSrcBits / NumEltBits); else if ((NumEltBits % NumSrcBits) == 0 && (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0) BroadcastIdx /= (NumEltBits / NumSrcBits); else return SDValue(); unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts); V = DAG.getBitcast(SrcVT, V); } // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. // First, look through bitcast: if the original value has a larger element // type than the shuffle, the broadcast element is in essence truncated. // Make that explicit to ease folding. if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; MVT BroadcastVT = VT; // Peek through any bitcast (only useful for loads). SDValue BC = peekThroughBitcasts(V); // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); } else if (MayFoldLoad(BC) && !cast(BC)->isVolatile()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : Opcode; } // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. LoadSDNode *Ld = cast(BC); SDValue BaseAddr = Ld->getOperand(1); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); DAG.makeEquivalentMemoryOrdering(Ld, V); } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); } else if (BroadcastIdx != 0) { // We can only broadcast from the zero-element of a vector register, // but it can be advantageous to broadcast from the zero-element of a // subvector. if (!VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); // Only broadcast the zero-element of a 128-bit subvector. unsigned EltSize = VT.getScalarSizeInBits(); if (((BroadcastIdx * EltSize) % 128) != 0) return SDValue(); // The shuffle input might have been a bitcast we looked through; look at // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll // later bitcast it to BroadcastVT. assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); V = extract128BitVector(V, BroadcastIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, DAG.getBitcast(MVT::f64, V)); // Bitcast back to the same scalar type as BroadcastVT. MVT SrcVT = V.getSimpleValueType(); if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); if (SrcVT.isVector()) { unsigned NumSrcElts = SrcVT.getVectorNumElements(); SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); } else { SrcVT = BroadcastVT.getScalarType(); } V = DAG.getBitcast(SrcVT, V); } // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && SrcVT == MVT::i64) { V = DAG.getBitcast(MVT::f64, V); unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); } // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. if (SrcVT.getSizeInBits() > 128) { MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / SrcVT.getScalarSizeInBits()); V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); V = DAG.getBitcast(ExtVT, V); } return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Attempt to match INSERTPS with one element from VA or VB being // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask // are updated. auto matchAsInsertPS = [&](SDValue VA, SDValue VB, ArrayRef CandidateMask) { unsigned ZMask = 0; int VADstIndex = -1; int VBDstIndex = -1; bool VAUsedInPlace = false; for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any VA inputs in place. if (i == CandidateMask[i]) { VAUsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (VADstIndex >= 0 || VBDstIndex >= 0) return false; if (CandidateMask[i] < 4) { // VA input out of place for insertion. VADstIndex = i; } else { // VB input for insertion. VBDstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (VADstIndex < 0 && VBDstIndex < 0) return false; // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned VBSrcIndex = 0; if (VADstIndex >= 0) { // If we have a VA input out of place, we use VA as the V2 element // insertion and don't use the original V2 at all. VBSrcIndex = CandidateMask[VADstIndex]; VBDstIndex = VADstIndex; VB = VA; } else { VBSrcIndex = CandidateMask[VBDstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. if (!VAUsedInPlace) VA = DAG.getUNDEF(MVT::v4f32); // Update V1, V2 and InsertPSMask accordingly. V1 = VA; V2 = VB; // Insert the V2 element into the desired position. InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); return true; }; if (matchAsInsertPS(V1, V2, Mask)) return true; // Commute and try again. SmallVector CommutedMask(Mask.begin(), Mask.end()); ShuffleVectorSDNode::commuteMask(CommutedMask); if (matchAsInsertPS(V2, V1, CommutedMask)) return true; return false; } static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask; if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } /// \brief Try to lower a shuffle as a permute of the inputs followed by an /// UNPCK instruction. /// /// This specifically targets cases where we end up with alternating between /// the two inputs, and so can permute them into something that feeds a single /// UNPCK instruction. Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(VT.is128BitVector() && "This routine only works on 128-bit vectors."); assert(!V2.isUndef() && "This routine should only be used when blending two inputs."); assert(Mask.size() >= 2 && "Single element masks are invalid."); int Size = Mask.size(); int NumLoInputs = count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](int ScalarSize, int Scale) { SmallVector V1Mask((unsigned)Size, -1); SmallVector V2Mask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; } // If we will have to shuffle both inputs to use the unpack, check whether // we can just unpack first and shuffle the result. If so, skip this unpack. if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) return SDValue(); // Shuffle the inputs into place. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. return DAG.getBitcast( VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one // that fits this mask. int OrigScalarSize = VT.getScalarSizeInBits(); for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) return Unpack; // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. if (NumLoInputs == 0 || NumHiInputs == 0) { assert((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"); int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; // FIXME: We could consider the total complexity of the permute of each // possible unpacking. Or at the least we should consider how many // half-crossings are created. // FIXME: We could consider commuting the unpacks. SmallVector PermMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); } return DAG.getVectorShuffle( VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, V1, V2), DAG.getUNDEF(VT), PermMask); } return SDValue(); } /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode( X86ISD::SHUFP, DL, MVT::v2f64, Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); // If we have a single input, insert that into V1 if we can do so cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; } // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. return DAG.getNode( isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } /// \brief Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); } assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getBitcast(MVT::v2f64, V1); V2 = DAG.getBitcast(MVT::v2f64, V2); return DAG.getBitcast(MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// \brief Test whether this can be lowered with a single SHUFPS instruction. /// /// This is used to disable more specialized lowerings when the shufps lowering /// will happen to be efficient. static bool isSingleSHUFPSMask(ArrayRef Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// \brief Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] < 0) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// \brief Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget.hasSSE3()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid // in SSE1 because otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } // Use low/high mov instructions. These are only valid in SSE1 because // otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions // but we aren't actually going to use the UNPCK instruction because doing // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) Mask = UnpackLoMask; else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (!isSingleSHUFPSMask(Mask)) { // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v4i32, V1, V2, Mask, DAG)) return Unpack; } // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); return DAG.getBitcast(MVT::v4i32, ShufPS); } /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. /// /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputVectorShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); SmallVector LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); // If we are splatting two values from one half - one to each half, then // we can shuffle that half so each is splatted to a dword, then splat those // to their respective halves. auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp, int DOffset) { int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4}; int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1}; V = DAG.getNode(ShufWOp, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); V = DAG.getBitcast(PSHUFDVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); return DAG.getBitcast(VT, V); }; if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0) return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0); if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0) return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2); // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); int NumFlippedBToBInputs = std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode( FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M >= 0 && M == FixIdx) M = FixFreeIdx; else if (M >= 0 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M >= 0 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M >= 0 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, MutableArrayRef FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] < 0) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] < 0) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SDValue V1Mask[16]; SDValue V2Mask[16]; V1InUse = false; V2InUse = false; int Size = Mask.size(); int Scale = 16 / Size; for (int i = 0; i < 16; ++i) { if (Mask[i / Scale] < 0) { V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { const int ZeroMask = 0x80; int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale : ZeroMask; int V2Idx = Mask[i / Scale] < Size ? ZeroMask : (Mask[i / Scale] - Size) * Scale + i % Scale; if (Zeroable[i / Scale]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } } if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V1), DAG.getBuildVector(MVT::v16i8, DL, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V2), DAG.getBuildVector(MVT::v16i8, DL, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; if (V1InUse && V2InUse) V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); else V = V1InUse ? V1 : V2; // Cast the result back to the correct type. return DAG.getBitcast(VT, V); } /// \brief Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. SmallVector MutableMask(Mask.begin(), Mask.end()); return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, MutableMask, Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input " "shuffles."); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, Mask, DAG); } /// \brief Check whether a compaction lowering can be done by dropping even /// elements and compute how many times even elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even elements must be dropped if /// there is such a number. Otherwise returns zero. static int canLowerByDroppingEvenElements(ArrayRef Mask, bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] < 0) continue; bool IsAnyViable = false; for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); } /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. // // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { for (int i = 0; i < 16; i += 2) if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) return false; return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector LoInputs; copy_if(Mask, std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] >= 0) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); if (PostDupI16Shuffle[i / 2] < 0) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input // lowerings can find an instruction sequence that is faster than a PSHUFB, we // want to preserve that and we can DAG combine any longer sequences into // a PSHUFB in the end. But once we start blending from multiple inputs, // the complexity of DAG combining bad patterns back into PSHUFB is too high, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget.hasSSSE3()) { bool V1InUse = false; bool V2InUse = false; SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some // cases. Even though the or may be (very minorly) more efficient, we // preference this lowering because there are common cases where part of // the complexity of the shuffles goes away when we do the final blend as // an unpack. // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); } return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) return BitBlend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. // // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. bool IsSingleInput = V2.isUndef(); if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); // We use the mask type to pick which bytes are preserved based on how many // elements are dropped. MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; SDValue ByteClearMask = DAG.getBitcast( MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); // Now pack things back together. V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } return Result; } // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together // with a pack. SDValue V = V1; std::array LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; std::array HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; SDValue VLoHalf, VHiHalf; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) && none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, DAG.getConstant(0x00FF, DL, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke VHiHalf. VHiHalf = DAG.getUNDEF(MVT::v8i16); // Squash the masks to point directly into VLoHalf. for (int &M : LoBlendMask) if (M >= 0) M /= 2; for (int &M : HiBlendMask) if (M >= 0) M /= 2; } else { // Otherwise just unpack the low half of V into VLoHalf and the high half into // VHiHalf so that we can blend them as i16s. SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL); VLoHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); VHiHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } /// \brief Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); assert(V2.getSimpleValueType() == VT && "Bad operand type!"); ArrayRef LoMask = Mask.slice(0, Mask.size() / 2); ArrayRef HiMask = Mask.slice(Mask.size() / 2); int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build // vectors. This helps shuffling with splats and zeros. auto SplitVector = [&](SDValue V) { V = peekThroughBitcasts(V); MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; auto *BV = dyn_cast(V); if (!BV) { LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(0, DL)); HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(OrigSplitNumElements, DL)); } else { SmallVector LoOps, HiOps; for (int i = 0; i < OrigSplitNumElements; ++i) { LoOps.push_back(BV->getOperand(i)); HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); } LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); } return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); }; SDValue LoV1, HiV1, LoV2, HiV2; std::tie(LoV1, HiV1) = SplitVector(V1); std::tie(LoV2, HiV2) = SplitVector(V2); // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef HalfMask) { bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; SmallVector V1BlendMask((unsigned)SplitNumElements, -1); SmallVector V2BlendMask((unsigned)SplitNumElements, -1); SmallVector BlendMask((unsigned)SplitNumElements, -1); for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { if (M >= NumElements + SplitNumElements) UseHiV2 = true; else UseLoV2 = true; V2BlendMask[i] = M - NumElements; BlendMask[i] = SplitNumElements + i; } else if (M >= 0) { if (M >= SplitNumElements) UseHiV1 = true; else UseLoV1 = true; V1BlendMask[i] = M; BlendMask[i] = i; } } // Because the lowering happens after all combining takes place, we need to // manually combine these blend masks as much as possible so that we create // a minimal number of high-level vector shuffle nodes. // First try just blending the halves of V1 or V2. if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) return DAG.getUNDEF(SplitVT); if (!UseLoV2 && !UseHiV2) return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); if (!UseLoV1 && !UseHiV1) return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); SDValue V1Blend, V2Blend; if (UseLoV1 && UseHiV1) { V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); } else { // We only use half of V1 so map the usage down into the final blend mask. V1Blend = UseLoV1 ? LoV1 : HiV1; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); } if (UseLoV2 && UseHiV2) { V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); } else { // We only use half of V2 so map the usage down into the final blend mask. V2Blend = UseLoV2 ? LoV2 : HiV2; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= SplitNumElements) BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); } return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); }; SDValue Lo = HalfBlend(LoMask); SDValue Hi = HalfBlend(HiMask); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } /// \brief Either split a vector in halves or decompose the shuffles and the /// blend. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, // prefer that lowering. This is especially important because broadcasts can // often fold with memory operands. auto DoBothBroadcast = [&] { int V1BroadcastIdx = -1, V2BroadcastIdx = -1; for (int M : Mask) if (M >= Size) { if (V2BroadcastIdx < 0) V2BroadcastIdx = M - Size; else if (M - Size != V2BroadcastIdx) return false; } else if (M >= 0) { if (V1BroadcastIdx < 0) V1BroadcastIdx = M; else if (M != V1BroadcastIdx) return false; } return true; }; if (DoBothBroadcast()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to // unusually few instructions. int LaneCount = VT.getSizeInBits() / 128; int LaneSize = Size / LaneCount; SmallBitVector LaneInputs[2]; LaneInputs[0].resize(LaneCount, false); LaneInputs[1].resize(LaneCount, false); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); } /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as /// a permutation and blend of those lanes. /// /// This essentially blends the out-of-lane inputs to each lane into the lane /// from a permuted copy of the vector. This lowering strategy results in four /// instructions in the worst case for a single-input cross lane shuffle which /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); int LaneSize = Size / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. if (!Subtarget.hasAVX2()) { bool LaneCrossing[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] / LaneSize)] = true; if (!LaneUsed[0] || !LaneUsed[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); SmallVector FlippedBlendMask(Size); for (int i = 0; i < Size; ++i) FlippedBlendMask[i] = Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) ? Mask[i] : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size); // Flip the vector, and blend the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), { 2, 3, 0, 1 }); Flipped = DAG.getBitcast(VT, Flipped); return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } /// \brief Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; bool IsLowZero = (Zeroable & 0x3) == 0x3; bool IsHighZero = (Zeroable & 0xc) == 0xc; // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. if (!IsLowZero && !IsHighZero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. if (!isa(peekThroughBitcasts(V1))) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } } // Try to use SHUF128 if possible. if (Subtarget.hasVLX()) { if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } } } // Otherwise form a 128-bit permutation. After accounting for undefs, // convert the 64-bit shuffle mask selection values into 128-bit // selection bits by dividing the indexes by 2 and shifting into positions // defined by a vperm2*128 instruction's immediate control byte. // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination // [2] - ignore // [3] - zero low half of destination // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?"); unsigned PermMask = 0; PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // Check the immediate mask and replace unused sources with undef. if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00) V1 = DAG.getUNDEF(VT); if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20) V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// /// This will only succeed when the result of fixing the 128-bit lanes results /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in /// each 128-bit lanes. This handles many cases where we can quickly blend away /// the lane crosses early and then use simpler shuffles within each lane. /// /// FIXME: It might be worthwhile at some point to support this without /// requiring the 128-bit lane-relative shuffles to be repeating, but currently /// in x86 only floating point has interesting non-repeating shuffles, and even /// those are still *marginally* more expensive. static SDValue lowerVectorShuffleByMerging128BitLanes( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This is only useful with multiple inputs."); int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); int NumLanes = Size / LaneSize; assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also // check whether the in-128-bit lane shuffles share a repeating pattern. SmallVector Lanes((unsigned)NumLanes, -1); SmallVector InLaneMask((unsigned)LaneSize, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; int j = i / LaneSize; if (Lanes[j] < 0) { // First entry we've seen for this lane. Lanes[j] = Mask[i] / LaneSize; } else if (Lanes[j] != Mask[i] / LaneSize) { // This doesn't match the lane selected previously! return SDValue(); } // Check that within each lane we have a consistent shuffle mask. int k = i % LaneSize; if (InLaneMask[k] < 0) { InLaneMask[k] = Mask[i] % LaneSize; } else if (InLaneMask[k] != Mask[i] % LaneSize) { // This doesn't fit a repeating in-lane mask. return SDValue(); } } // First shuffle the lanes into place. MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, VT.getSizeInBits() / 64); SmallVector LaneMask((unsigned)NumLanes * 2, -1); for (int i = 0; i < NumLanes; ++i) if (Lanes[i] >= 0) { LaneMask[2 * i + 0] = 2*Lanes[i] + 0; LaneMask[2 * i + 1] = 2*Lanes[i] + 1; } V1 = DAG.getBitcast(LaneVT, V1); V2 = DAG.getBitcast(LaneVT, V2); SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); // Cast it back to the type we actually want. LaneShuffle = DAG.getBitcast(VT, LaneShuffle); // Now do a simple shuffle that isn't lane crossing. SmallVector NewMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && "Must not introduce lane crosses at this point!"); return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. /// This allows for fast cases such as subvector extraction/insertion /// or shuffling smaller vector types which can lower more efficiently. static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.is256BitVector() || VT.is512BitVector()) && "Expected 256-bit or 512-bit vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); if (!UndefLower && !UndefUpper) return SDValue(); // Upper half is undef and lower half is whole upper subvector. // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> if (UndefUpper && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(HalfNumElts, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(0, DL)); } // Lower half is undef and upper half is whole lower subvector. // e.g. vector_shuffle or if (UndefLower && isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(HalfNumElts, DL)); } // If the shuffle only uses two of the four halves of the input operands, // then extract them and perform the 'half' shuffle at half width. // e.g. vector_shuffle or int HalfIdx1 = -1, HalfIdx2 = -1; SmallVector HalfMask(HalfNumElts); unsigned Offset = UndefLower ? HalfNumElts : 0; for (unsigned i = 0; i != HalfNumElts; ++i) { int M = Mask[i + Offset]; if (M < 0) { HalfMask[i] = M; continue; } // Determine which of the 4 half vectors this element is from. // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. int HalfIdx = M / HalfNumElts; // Determine the element index into its half vector source. int HalfElt = M % HalfNumElts; // We can shuffle with up to 2 half vectors, set the new 'half' // shuffle mask accordingly. if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { HalfMask[i] = HalfElt; HalfIdx1 = HalfIdx; continue; } if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { HalfMask[i] = HalfElt + HalfNumElts; HalfIdx2 = HalfIdx; continue; } // Too many half vectors referenced. return SDValue(); } assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); // Only shuffle the halves of the inputs when useful. int NumLowerHalves = (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); int NumUpperHalves = (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); // uuuuXXXX - don't extract uppers just to insert again. if (UndefLower && NumUpperHalves != 0) return SDValue(); // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. if (UndefUpper && NumUpperHalves == 2) return SDValue(); // AVX2 - XXXXuuuu - always extract lowers. if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); // AVX2 supports variable 32-bit element cross-lane shuffles. if (VT == MVT::v8f32 || VT == MVT::v8i32) { // XXXXuuuu - don't extract lowers and uppers. if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) return SDValue(); } } // AVX512 - XXXXuuuu - always extract lowers. if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0)) return SDValue(); auto GetHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? V1 : V2); HalfIdx = (HalfIdx % 2) * HalfNumElts; return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, DAG.getIntPtrConstant(HalfIdx, DL)); }; SDValue Half1 = GetHalfVector(HalfIdx1); SDValue Half2 = GetHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } /// Handle case where shuffle sources are coming from the same 128-bit lane and /// every lane can be represented as the same repeating mask - allowing us to /// shuffle the sources with the repeating shuffle and then permute the result /// to the destination lanes. static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; // On AVX2 we may be able to just shuffle the lowest elements and then // broadcast the result. if (Subtarget.hasAVX2()) { for (unsigned BroadcastSize : {16, 32, 64}) { if (BroadcastSize <= VT.getScalarSizeInBits()) continue; int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); // Attempt to match a repeating pattern every NumBroadcastElts, // accounting for UNDEFs but only references the lowest 128-bit // lane of the inputs. auto FindRepeatingBroadcastMask = [&](SmallVectorImpl &RepeatMask) { for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) { int M = Mask[i + j]; if (M < 0) continue; int &R = RepeatMask[j]; if (0 != ((M % NumElts) / NumLaneElts)) return false; if (0 <= R && R != M) return false; R = M; } return true; }; SmallVector RepeatMask((unsigned)NumElts, -1); if (!FindRepeatingBroadcastMask(RepeatMask)) continue; // Shuffle the (lowest) repeated elements in place for broadcast. SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); // Shuffle the actual broadcast. SmallVector BroadcastMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) BroadcastMask[i + j] = j; return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), BroadcastMask); } } // Bail if the shuffle mask doesn't cross 128-bit lanes. if (!is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); // Bail if we already have a repeated lane shuffle mask. SmallVector RepeatedShuffleMask; if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) return SDValue(); // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; int NumSubLanes = NumLanes * SubLaneScale; int NumSubLaneElts = NumLaneElts / SubLaneScale; // Check that all the sources are coming from the same lane and see if we can // form a repeating shuffle mask (local to each sub-lane). At the same time, // determine the source sub-lane for each destination sub-lane. int TopSrcSubLane = -1; SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1); SmallVector RepeatedSubLaneMasks[2] = { SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef), SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef)}; for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { // Extract the sub-lane mask, check that it all comes from the same lane // and normalize the mask entries to come from the first lane. int SrcLane = -1; SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1); for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; if (M < 0) continue; int Lane = (M % NumElts) / NumLaneElts; if ((0 <= SrcLane) && (SrcLane != Lane)) return SDValue(); SrcLane = Lane; int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); SubLaneMask[Elt] = LocalM; } // Whole sub-lane is UNDEF. if (SrcLane < 0) continue; // Attempt to match against the candidate repeated sub-lane masks. for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) { for (int i = 0; i != NumSubLaneElts; ++i) { if (M1[i] < 0 || M2[i] < 0) continue; if (M1[i] != M2[i]) return false; } return true; }; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) continue; // Merge the sub-lane mask into the matching repeated sub-lane mask. for (int i = 0; i != NumSubLaneElts; ++i) { int M = SubLaneMask[i]; if (M < 0) continue; assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && "Unexpected mask element"); RepeatedSubLaneMask[i] = M; } // Track the top most source sub-lane - by setting the remaining to UNDEF // we can greatly simplify shuffle matching. int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); Dst2SrcSubLanes[DstSubLane] = SrcSubLane; break; } // Bail if we failed to find a matching repeated sub-lane mask. if (Dst2SrcSubLanes[DstSubLane] < 0) return SDValue(); } assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && "Unexpected source lane"); // Create a repeating shuffle mask for the entire vector. SmallVector RepeatedMask((unsigned)NumElts, -1); for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { int Lane = SubLane / SubLaneScale; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = RepeatedSubLaneMask[Elt]; if (M < 0) continue; int Idx = (SubLane * NumSubLaneElts) + Elt; RepeatedMask[Idx] = M + (Lane * NumLaneElts); } } SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); // Shuffle each source sub-lane to its destination. SmallVector SubLaneMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumSubLaneElts) { int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; if (SrcSubLane < 0) continue; for (int j = 0; j != NumSubLaneElts; ++j) SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); } return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), SubLaneMask); } static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, unsigned &ShuffleImm, ArrayRef Mask) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. ShuffleImm = 0; bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] < 0) return false; int Val = (i & 6) + NumElts * (i & 1); int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); if (Mask[i] < Val || Mask[i] > Val + 1) ShufpdMask = false; if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) CommutableMask = false; ShuffleImm |= (Mask[i] % 2) << i; } if (ShufpdMask) return true; if (CommutableMask) { std::swap(V1, V2); return true; } return false; } static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, DAG.getConstant(Immediate, DL, MVT::i8)); } /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Try to create an in-lane repeating shuffle mask and then shuffle the // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; // Try to create an in-lane repeating shuffle mask and then shuffle the // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on both lanes. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector PSHUFDMask; scaleShuffleMask(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getBitcast(MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } // AVX2 provides a direct instruction for permuting a single input across // lanes. return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!isShuffleMaskInputInPlace(0, Mask) && !isShuffleMaskInputInPlace(1, Mask)) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // Try to create an in-lane repeating shuffle mask and then shuffle the // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return V; // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code than vblend by using // vpunpcklwd and vpunpckhwd instrs. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the two 128-bit // lanes. SmallVector RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return V; if (V2.isUndef()) { // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. return lowerV8I16GeneralSingleInputVectorShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512BWVL can lower to VPERMW. if (Subtarget.hasBWI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); } /// \brief Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return V; // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget); if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512VBMIVL can lower to VPERMB. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); } /// \brief High-level routine to lower various 256-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we // can check for those subtargets here and avoid much of the subtarget // querying in the per-vector-type lowering routines. With AVX1 we have // essentially *zero* ability to manipulate a 256-bit vector with integer // types. Since we'll use floating point types there eventually, just // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget.hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) { // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) return V; if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); V1 = DAG.getBitcast(FpVT, V1); V2 = DAG.getBitcast(FpVT, V2); return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); } } /// \brief Try to lower a vector shuffle as a 128-bit shuffles. static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); // To handle 256 bit vector requires VLX and most probably // function lowerV2X128VectorShuffle() is better solution. assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // Check for patterns which can be matched with a single insert of a 256-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } assert(WidenedMask.size() == 4); // See if this is an insertion of the lower 128-bits of V2 into V1. bool IsInsert = true; int V2Index = -1; for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; // Make sure all V1 subvectors are in place. if (WidenedMask[i] < 4) { if (WidenedMask[i] != i) { IsInsert = false; break; } } else { // Make sure we only have a single V2 index and its the lowest 128-bits. if (V2Index >= 0 || WidenedMask[i] != 4) { IsInsert = false; break; } V2Index = i; } } if (IsInsert && V2Index >= 0) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, DAG.getIntPtrConstant(0, DL)); return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); } // Try to lower to to vshuf64x2/vshuf32x4. SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; unsigned PermMask = 0; // Insure elements came from the same Op. for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; SDValue Op = WidenedMask[i] >= 4 ? V2 : V1; unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op; else if (Ops[OpIndex] != Op) return SDValue(); // Convert the 128-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. PermMask |= (WidenedMask[i] % 4) << (i * 2); } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], DAG.getConstant(PermMask, DL, MVT::i8)); } /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) | ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } SmallVector RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Shuf128; if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return Unpck; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four // 128-bit lanes. SmallVector Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector PSHUFDMask; scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, DAG.getBitcast(MVT::v16i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } SmallVector Repeated256Mask; if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Shuf128; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the four 128-bit // lanes. SmallVector RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use byte rotation instructions. if (Subtarget.hasBWI()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (V2.isUndef()) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. return lowerV8I16GeneralSingleInputVectorShuffle( DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// \brief High-level routine to lower various 512-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = Mask.size(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } } // Lower vXi1 vector shuffles. // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); MVT ExtVT; switch (VT.SimpleTy) { default: llvm_unreachable("Expected a vector of i1 elements"); case MVT::v2i1: ExtVT = MVT::v2i64; break; case MVT::v4i1: ExtVT = MVT::v4i32; break; case MVT::v8i1: // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit // shuffle. ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64; break; case MVT::v16i1: ExtVT = MVT::v16i32; break; case MVT::v32i1: ExtVT = MVT::v32i16; break; case MVT::v64i1: ExtVT = MVT::v64i8; break; } if (ISD::isBuildVectorAllZeros(V1.getNode())) V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V1.getNode())) V1 = getOnesVector(ExtVT, DAG, DL); else V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); if (V2.isUndef()) V2 = DAG.getUNDEF(ExtVT); else if (ISD::isBuildVectorAllZeros(V2.getNode())) V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V2.getNode())) V2 = getOnesVector(ExtVT, DAG, DL); else V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask); // i1 was sign extended we can use X86ISD::CVT2MASK. int NumElems = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && (NumElems >= 32)) || (Subtarget.hasDQI() && (NumElems < 32))) return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle); return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); } /// Helper function that returns true if the shuffle mask should be /// commuted to improve canonicalization. static bool canonicalizeShuffleMaskWithCommute(ArrayRef Mask) { int NumElements = Mask.size(); int NumV1Elements = 0, NumV2Elements = 0; for (int M : Mask) if (M < 0) continue; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return true; assert(NumV1Elements > 0 && "No V1 indices"); if (NumV2Elements == 0) return false; // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum // indices for V2. When those are equal, try to ensure that the number of odd // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : Mask.slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) return true; if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) SumV2Indices += i; else if (Mask[i] >= 0) SumV1Indices += i; if (SumV2Indices < SumV1Indices) return true; if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) NumV2OddIndices += i % 2; else if (Mask[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return true; } } } return false; } /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc DL(Op); bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); assert((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"); bool V1IsUndef = V1.isUndef(); bool V2IsUndef = V2.isUndef(); if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node to second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return DAG.getCommutedVectorShuffle(*SVOp); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef) for (int M : Mask) if (M >= NumElements) { SmallVector NewMask(Mask.begin(), Mask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); } // Check for illegal shuffle mask element index values. int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; assert(llvm::all_of(Mask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && "Out of bounds shuffle index"); // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); // Make sure that the new vector type is legal. For example, v2f64 isn't // legal on SSE1. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { V1 = DAG.getBitcast(NewVT, V1); V2 = DAG.getBitcast(NewVT, V2); return DAG.getBitcast( VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); } } // Commute the shuffle if it will improve canonicalization. if (canonicalizeShuffleMaskWithCommute(Mask)) return DAG.getCommutedVectorShuffle(*SVOp); // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is256BitVector()) return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is512BitVector()) return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (Is1BitVector) return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } /// \brief Try to lower a VSELECT instruction to a vector shuffle. static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); auto *CondBV = cast(Cond); // Only non-legal VSELECTs reach this lowering, convert those into generic // shuffles and re-use the shuffle lowering path for blends. SmallVector Mask; for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { SDValue CondElt = CondBV->getOperand(i); Mask.push_back( isa(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) : -1); } return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // If this VSELECT has a vector if i1 as a mask, it will be directly matched // with patterns on the mask registers on AVX-512. if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) return Op; // Variable blends are only legal from SSE4.1 onward. if (!Subtarget.hasSSE41()) return SDValue(); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition // into an i1 condition so that we can use the mask-based 512-bit blend // instructions. if (VT.getSizeInBits() == 512) { SDValue Cond = Op.getOperand(0); // The vNi1 condition case should be handled above as it can be trivially // lowered. assert(Cond.getValueType().getScalarSizeInBits() == VT.getScalarSizeInBits() && "Should have a size-matched integer condition!"); // Build a mask by testing the condition against itself (tests for zero). MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond); // Now return a new VSELECT using the mask. return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2)); } // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; case MVT::v32i8: // The byte blends for AVX vectors were introduced only in AVX2. if (Subtarget.hasAVX2()) return Op; return SDValue(); case MVT::v8i16: case MVT::v16i16: // FIXME: We should custom lower this by fixing the condition and using i8 // blends. return SDValue(); } } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); return DAG.getBitcast(MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa(Op.getOperand(1))) return Op; } return SDValue(); } /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512/128 if (!isa(Idx)) { unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } // Canonicalize result type to MVT::i32. if (EltVT != MVT::i32) { SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec, Idx); return DAG.getAnyExtOrTrunc(Extract, dl, EltVT); } unsigned IdxVal = cast(Idx)->getZExtValue(); // Extracts from element 0 are always allowed. if (IdxVal == 0) return Op; // If the kshift instructions of the correct width aren't natively supported // then we need to promote the vector to the native size to get the correct // zeroing behavior. if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) || (VecVT.getVectorNumElements() < 8)) { VecVT = MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, DAG.getUNDEF(VecVT), Vec, DAG.getIntPtrConstant(0, dl)); } // Use kshiftr instruction to move to the lower element. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec, DAG.getIntPtrConstant(0, dl)); } SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG, Subtarget); if (!isa(Idx)) { // Its more profitable to go through memory (1 cycles throughput) // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) // IACA tool was used to get performance estimation // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) // // example : extractelement <16 x i8> %a, i32 %i // // Block Throughput: 3.00 Cycles // Throughput Bottleneck: Port5 // // | Num Of | Ports pressure in cycles | | // | Uops | 0 - DV | 5 | 6 | 7 | | // --------------------------------------------- // | 1 | | 1.0 | | | CP | vmovd xmm1, edi // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 // Total Num Of Uops: 4 // // // Block Throughput: 1.00 Cycles // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 // // | | Ports pressure in cycles | | // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | // --------------------------------------------------------- // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] // Total Num Of Uops: 4 return SDValue(); } unsigned IdxVal = cast(Idx)->getZExtValue(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. if (VecVT.is256BitVector() || VecVT.is512BitVector()) { // Get the 128-bit vector. Vec = extract128BitVector(Vec, IdxVal, DAG, dl); MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, dl, MVT::i32)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); MVT VT = Op.getSimpleValueType(); if (VT.getSizeInBits() == 16) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless // we're going to zero extend the register or fold the store (SSE41 only). if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); // Transform it so it match pextrw which produces a 32-bit result. SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (Subtarget.hasSSE41()) if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; // TODO: We only extract a single element from v16i8, we can probably afford // to be more aggressive here before using the default approach of spilling to // stack. if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { // Extract either the lowest i32 or any i16, and extract the sub-byte. int DWordIdx = IdxVal / 4; if (DWordIdx == 0) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), DAG.getIntPtrConstant(DWordIdx, dl)); int ShiftVal = (IdxVal % 4) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, DAG.getConstant(ShiftVal, dl, MVT::i32)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } int WordIdx = IdxVal / 2; SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, DAG.getBitcast(MVT::v8i16, Vec), DAG.getIntPtrConstant(WordIdx, dl)); int ShiftVal = (IdxVal % 2) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, DAG.getConstant(ShiftVal, dl, MVT::i16)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast(IdxVal), -1, -1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. if (IdxVal == 0) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } return SDValue(); } /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Elt = Op.getOperand(1); SDValue Idx = Op.getOperand(2); MVT VecVT = Vec.getSimpleValueType(); if (!isa(Idx)) { // Non constant index. Extend source and destination, // insert element and then truncate the result. unsigned NumElts = VecVT.getVectorNumElements(); MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } unsigned IdxVal = cast(Idx)->getZExtValue(); unsigned NumElems = VecVT.getVectorNumElements(); // If the kshift instructions of the correct width aren't natively supported // then we need to promote the vector to the native size to get the correct // zeroing behavior. if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) { // Need to promote to v16i1, do the insert, then extract back. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), Vec, DAG.getIntPtrConstant(0, dl)); Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op, DAG.getIntPtrConstant(0, dl)); } SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); if (Vec.isUndef()) { if (IdxVal) EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return EltInVec; } // Insertion of one bit into first position if (IdxVal == 0 ) { // Clean top bits of vector. EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(NumElems - 1, dl, MVT::i8)); EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec, DAG.getConstant(NumElems - 1, dl, MVT::i8)); // Clean the first bit in source vector. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } // Insertion of one bit into last position if (IdxVal == NumElems - 1) { // Move the bit to the last position inside the vector. EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Clean the last bit in the source vector. Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } // Move the current value of the bit to be replace to bit 0. SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Xor with the new bit. Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec); // Shift to MSB, filling bottom bits with 0. Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged, DAG.getConstant(NumElems - 1, dl, MVT::i8)); // Shift to the final position, filling upper bits with 0. Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged, DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8)); // Xor with original vector to cancel out the original bit value that's still // present. return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG, Subtarget); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); if (!isa(N2)) return SDValue(); auto *N2C = cast(N2); unsigned IdxVal = N2C->getZExtValue(); bool IsZeroElt = X86::isZeroNode(N1); bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); // If we are inserting a element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && 16 <= EltVT.getSizeInBits()) { SmallVector BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) : getOnesVector(VT, DAG, dl); return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { // With a 256-bit vector, we can insert into the zero element efficiently // using a blend if we have AVX or AVX2 and the right data type. if (VT.is256BitVector() && IdxVal == 0) { // TODO: It is worthwhile to cast integer to floating point and back // and incur a domain crossing penalty if that's what we'll end up // doing anyway after extracting to a 128-bit vector. if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); N2 = DAG.getIntPtrConstant(1, dl); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); } } // Get the desired 128-bit vector chunk. SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(NumEltsIn128)); // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, dl, MVT::i32)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. SSE41 required for pinsrb. if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { unsigned Opc; if (VT == MVT::v8i16) { assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW"); Opc = X86ISD::PINSRW; } else { assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector"); assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB"); Opc = X86ISD::PINSRB; } if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } if (Subtarget.hasSSE41()) { if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into // these bits. For example (insert (extract, 3), 2) could be matched by // putting the '3' into bits [7:6] of X86ISD::INSERTPS. // Bits [5:4] of the constant are the destination select. This is the // value of the incoming immediate. // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather // than an insertps. Blends are simpler operations in hardware and so // will always have equal or better performance than insertps. // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. N2 = DAG.getIntPtrConstant(1, dl); N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); } N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } // PINSR* works with constant index. if (EltVT == MVT::i32 || EltVT == MVT::i64) return Op; } return SDValue(); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); // It's always cheaper to replace a xor+movd with xorps and simplifies further // combines. if (X86::isZeroNode(Op.getOperand(0))) return getZeroVector(OpVT, Subtarget, DAG, dl); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. unsigned SizeFactor = OpVT.getSizeInBits() / 128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } assert(OpVT.is128BitVector() && "Expected an SSE type!"); // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. if (OpVT == MVT::v4i32) return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1); return insert1BitVector(Op, DAG, Subtarget); } // Returns the appropriate wrapper opcode for a global reference. unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const { // References to absolute symbols are never PC-relative. if (GV && GV->isAbsoluteSymbolRef()) return X86ISD::Wrapper; CodeModel::Model M = getTargetMachine().getCodeModel(); if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) return X86ISD::WrapperRIP; return X86ISD::Wrapper; } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); SDLoc DL(JT); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); return Result; } SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { const char *Sym = cast(Op)->getSymbol(); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); SDLoc DL(Op); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isPositionIndependent() && !Subtarget.is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } // For symbols that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget.classifyBlockAddressReference(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl, int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. unsigned char OpFlags = Subtarget.classifyGlobalReference(GV); CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { // A direct static reference to a global. Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); Offset = 0; } else { Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); } Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, DAG.getConstant(Offset, dl, PtrVT)); return Result; } SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast(Op)->getGlobal(); int64_t Offset = cast(Op)->getOffset(); return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); } static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR; if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI.setAdjustsStack(true); MFI.setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; SDLoc dl(GA); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo *MFI = DAG.getMachineFunction() .getInfo(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (is64Bit) { Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations // of Base. // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); } // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is // initialexec. unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; } else if (model == TLSModel::InitialExec) { if (is64Bit) { OperandFlags = X86II::MO_GOTTPOFF; WrapperKind = X86ISD::WrapperRIP; } else { OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; } } else { llvm_unreachable("Unexpected model"); } // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool PositionIndependent = isPositionIndependent(); if (Subtarget.isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget.is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), PositionIndependent); } llvm_unreachable("Unknown TLS model."); } if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); // Lowering the machine isd will make sure everything is in the right // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), DAG.getIntPtrConstant(0, DL, true), Chain.getValue(1), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } if (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium() || Subtarget.isTargetWindowsGNU()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) // mov rcx, qword [rdx+rcx*8] // mov eax, .tls$:tlsvar // [rax+rcx] contains the address // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget.is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); SDValue TlsArray = Subtarget.is64Bit() ? DAG.getIntPtrConstant(0x58, dl) : (Subtarget.isTargetWindowsGNU() ? DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr)); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { res = ThreadPointer; } else { // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); if (Subtarget.is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, MachinePointerInfo(), MVT::i32); else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); auto &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); } llvm_unreachable("TLS not implemented for this target."); } /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, dl, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i8)) : DAG.getConstant(0, dl, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } // If the shift amount is larger or equal than the width of a part we can't // rely on the results of shld/shrd. Insert a test and select the appropriate // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, dl, MVT::i8)); SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, AndNode, DAG.getConstant(0, dl, MVT::i8)); SDValue Hi, Lo; SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } else { Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); } if (SrcVT.getVectorElementType() == MVT::i1) { - if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)) - return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src)); + if (SrcVT == MVT::v2i1) { + // For v2i1, we need to widen to v4i1 first. + assert(VT == MVT::v2f64 && "Unexpected type"); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src, + DAG.getUNDEF(MVT::v2i1)); + return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src)); + } + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); } return SDValue(); } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); // These are really Legal; return the operand so the caller accepts it as // Legal. if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) return Op; if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && Subtarget.is64Bit()) { return Op; } SDValue ValueToStore = Op.getOperand(0); if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); unsigned ByteSize = SrcVT.getSizeInBits()/8; FrameIndexSDNode *FI = dyn_cast(StackSlot); MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is flagged to the FILD_FLAG. This // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); unsigned SSFISize = Op.getValueSizeInBits()/8; int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); } return Result; } /// 64-bit unsigned integer to double expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const { // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } #ifdef __SSE3__ haddpd %xmm0, %xmm0 #else pshufd $0x4e, %xmm0, %xmm1 addpd %xmm1, %xmm0 #endif */ SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); SmallVector CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)))); CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget.hasSSE3()) { // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1}); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, DAG.getBitcast(MVT::v2f64, Shuffle), Sub); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0, dl)); } /// 32-bit unsigned integer to float expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(0)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Load), DAG.getIntPtrConstant(0, dl)); // Or the load with the bias. SDValue Or = DAG.getNode( ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. MVT DestVT = Op.getSimpleValueType(); if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, DAG.getIntPtrConstant(0, dl)); if (DestVT.bitsGT(MVT::f64)) return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); // Handle final rounding. return Sub; } static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDLoc &DL) { if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); SDValue N0 = Op.getOperand(0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); // Legalize to v4i32 type. N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); if (Subtarget.hasAVX512()) return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, // but using v2i32 to v2f64 with X86ISD::CVTSI2P. SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32); SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); // Two to the power of half-word-size. SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64); // Clear upper part of LO, lower HI. SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask); SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI); fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW); SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO); // Add the two halves. return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO); } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); // #else // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; // uint4 hi = (v >> 16) | (uint4) 0x53000000; // #endif // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; // We shouldn't use it when unsafe-fp-math is enabled though: we might later // reassociate the two FADDs, and if we do that, the algorithm fails // spectacularly (PR24512). // FIXME: If we ever have some kind of Machine FMF, this should be marked // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because // there's also the MachineCombiner reassociations happening on Machine IR. if (DAG.getTarget().Options.UnsafeFPMath) return SDValue(); SDLoc DL(Op); SDValue V = Op->getOperand(0); MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && "Unsupported custom type"); // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 // -- 0x53000000 // - A shift: // -- v >> 16 // Create the splat vector for 0x4b000000. SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT); // Create the splat vector for 0x53000000. SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT); // Create the right shift. SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT); SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); SDValue Low, High; if (Subtarget.hasSSE41()) { MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); // High will be bitcasted right away, so do not bother bitcasting back to // its original type. High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); // uint4 hi = (v >> 16) | (uint4) 0x53000000; High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). SDValue VecCstFAdd = DAG.getConstantFP( APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); if (SrcVT.getVectorElementType() == MVT::i1) { - if (SrcVT == MVT::v2i1) - return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0)); + if (SrcVT == MVT::v2i1) { + // For v2i1, we need to widen to v4i1 first. + assert(Op.getValueType() == MVT::v2f64 && "Unexpected type"); + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0, + DAG.getUNDEF(MVT::v2i1)); + return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64, + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0)); + } + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); } switch (SrcVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); case MVT::v2i32: return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: assert(!Subtarget.hasAVX512()); return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); } } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. return Op; } if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); return Fild; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue ValueToStore = Op.getOperand(0); if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) int SSFI = cast(StackSlot)->getIndex(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), // just return an pair. // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 // to i16, i32 or i64, and we lower it to a legal sequence. // If lowered to the final integer result we return a pair. // Otherwise we lower it to a sequence ending with a FIST, return a // pair, and the caller is responsible for loading // the final integer result from StackSlot. std::pair X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. return std::make_pair(SDValue(), SDValue()); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. bool UnsignedFixup = !IsSigned && DstTy == MVT::i64 && (!Subtarget.is64Bit() || !isScalarFPTypeInSSEReg(TheVT)); if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); // These are really Legal. if (DstTy == MVT::i32 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); if (Subtarget.is64Bit() && DstTy == MVT::i64 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; switch (DstTy.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. if (UnsignedFixup) { // // Conversion to unsigned i64 is implemented with a select, // depending on whether the source value fits in the range // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent // to XOR'ing the high 32 bits with Adjust. // // Being a power of 2, Thresh is exactly representable in all FP formats. // For X87 we'd like to use the smallest FP type for this constant, but // for DAG type consistency we have to match the FP operand type. APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &LosesInfo); else if (TheVT == MVT::f80) Status = Thresh.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &LosesInfo); assert(Status == APFloat::opOK && !LosesInfo && "FP conversion should have been exact"); SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); SDValue Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Adjust = DAG.getSelect(DL, MVT::i32, Cmp, DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(0x80000000, DL, MVT::i32)); SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); } // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MachinePointerInfo::getFixedStack(MF, SSFI)); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOStore, MemSize, MemSize); if (UnsignedFixup) { // Insert the FIST, load its result as two i32's, // and XOR the high i32 with Adjust. SDValue FistOps[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), FistOps, DstTy, MMO); SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo()); SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo()); High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); if (Subtarget.is64Bit()) { // Join High32 and Low32 into a 64-bit result. // (High32 << 32) | Low32 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, DAG.getConstant(32, DL, MVT::i8)); SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); return std::make_pair(Result, SDValue()); } SDValue ResultOps[] = { Low32, High32 }; SDValue pair = IsReplace ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) : DAG.getMergeValues(ResultOps, DL); return std::make_pair(pair, SDValue()); } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && (VT != MVT::v16i16 || InVT != MVT::v16i8) && (VT != MVT::v8i64 || InVT != MVT::v8i32) && (VT != MVT::v8i64 || InVT != MVT::v8i16) && (VT != MVT::v16i32 || InVT != MVT::v16i16) && (VT != MVT::v16i32 || InVT != MVT::v16i8) && (VT != MVT::v32i16 || InVT != MVT::v32i8)) return SDValue(); if (Subtarget.hasInt256()) return DAG.getNode(X86ISD::VZEXT, dl, VT, In); // Optimize vectors in AVX mode: // // v8i16 -> v8i32 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. // Concat upper and lower parts. // // v4i32 -> v4i64 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); SDValue Undef = DAG.getUNDEF(InVT); bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getBitcast(HVT, OpLo); OpHi = DAG.getBitcast(HVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); // Extend VT if the scalar type is v8/v16 and BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI() && (VT.getVectorElementType().getSizeInBits() <= 16)) ExtVT = MVT::getVectorVT(MVT::i32, NumElts); // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, DL)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue One = DAG.getConstant(1, DL, WideVT); SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL); SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero); // Truncate if we had to extend i16/i8 above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); SelectedVal = DAG.getNode(X86ISD::VTRUNC, DL, WideVT, SelectedVal); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal, DAG.getIntPtrConstant(0, DL)); return SelectedVal; } static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); if (SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); if (Subtarget.hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() || Op.getSimpleValueType().getVectorNumElements() != SVT.getVectorNumElements()); return SDValue(); } /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS. /// It makes use of the fact that vectors with enough leading sign/zero bits /// prevent the PACKSS/PACKUS from saturating the results. /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates /// within each 128-bit lane. static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && "Unexpected PACK opcode"); // Requires SSE2 but AVX512 has fast truncate. if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); EVT SrcVT = In.getValueType(); // No truncation required, we might get here due to recursive calls. if (SrcVT == DstVT) return In; // We only support vector truncation to 128bits or greater from a // 256bits or greater source. unsigned DstSizeInBits = DstVT.getSizeInBits(); unsigned SrcSizeInBits = SrcVT.getSizeInBits(); if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0) return SDValue(); LLVMContext &Ctx = *DAG.getContext(); unsigned NumElems = SrcVT.getVectorNumElements(); assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation"); EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); // Extract lower/upper subvectors. unsigned NumSubElts = NumElems / 2; SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); // Pack to the largest type possible: // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. EVT InVT = MVT::i16, OutVT = MVT::i8; if (DstVT.getScalarSizeInBits() > 8 && (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) { InVT = MVT::i32; OutVT = MVT::i16; } unsigned SubSizeInBits = SrcSizeInBits / 2; InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors. if (SrcVT.is256BitVector()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); return DAG.getBitcast(DstVT, Res); } // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors. // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK). if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). Res = DAG.getBitcast(MVT::v4i64, Res); Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); if (DstVT.is256BitVector()) return DAG.getBitcast(DstVT, Res); // If 512bit -> 128bit truncate another stage. EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); Res = DAG.getBitcast(PackedVT, Res); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } // Recursively pack lower/upper subvectors, concat result and pack again. assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater"); EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts); Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget); PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; if (InVT.getScalarSizeInBits() <= 16) { if (Subtarget.hasBWI()) { // legal, will go to VPMOVB2M, VPMOVW2M // Shift packed bytes not supported natively, bitcast to word MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In), DAG.getConstant(ShiftInx, DL, ExtVT)); ShiftNode = DAG.getBitcast(InVT, ShiftNode); return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); } // Use TESTD/Q, extended vector to packed dword/qword. assert((InVT.is256BitVector() || InVT.is128BitVector()) && "Unexpected vector type."); unsigned NumElts = InVT.getVectorNumElements(); MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; ShiftInx = InVT.getScalarSizeInBits() - 1; } SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, DAG.getConstant(ShiftInx, DL, InVT)); return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); unsigned InNumEltBits = InVT.getScalarSizeInBits(); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { // word to byte only under BWI if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8 return DAG.getNode(X86ISD::VTRUNC, DL, VT, getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } // Truncate with PACKSS if we are truncating a vector with sign-bits that // extend all the way to the packed/truncated value. unsigned NumPackedBits = std::min(VT.getScalarSizeInBits(), 16); if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; // Truncate with PACKUS if we are truncating a vector with leading zero bits // that extend all the way to the packed/truncated value. // Pre-SSE41 we can only use PACKUSWB. KnownBits Known; DAG.computeKnownBits(In, Known); NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8; if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros()) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) return V; if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2, DL)); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomes PSHUFB. if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); // The PSHUFB mask: static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1 }; In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask2[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(4, DL)); OpLo = DAG.getBitcast(MVT::v16i8, OpLo); OpHi = DAG.getBitcast(MVT::v16i8, OpHi); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); // The MOVLHPS Mask: static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); return DAG.getBitcast(MVT::v8i16, res); } // Handle truncation of V256 to V128 using shuffles. if (!VT.is128BitVector() || !InVT.is256BitVector()) return SDValue(); assert(Subtarget.hasFp256() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); SmallVector MaskVec(NumElems * 2, -1); // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; In = DAG.getBitcast(NVT, In); SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); SDValue Src = Op.getOperand(0); SDLoc dl(Op); if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); } return SDValue(); } assert(!VT.isVector()); std::pair Vals = FP_TO_INTHelper(Op, DAG, IsSigned, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo()); // The node is the result. return FIST; } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT))); } /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) for (SDNode *User : Op->uses()) if (User->getOpcode() == ISD::FNEG) return Op; SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. MVT LogicVT; MVT EltVT; if (VT.isVector()) { LogicVT = VT; EltVT = VT.getVectorElementType(); } else if (IsF128) { // SSE instructions are used for optimized f128 logical operations. LogicVT = MVT::f128; EltVT = VT; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. // Using a 16-byte mask allows folding the load of the mask with // the logic op, so it can save (~4 bytes) on code size. LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; EltVT = VT; } unsigned EltBits = EltVT.getSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = EltVT == MVT::f64 ? APFloat::IEEEdouble() : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); unsigned LogicOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, // and extract the scalar result back out. Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Mag = Op.getOperand(0); SDValue Sign = Op.getOperand(1); SDLoc dl(Op); // If the sign operand is smaller, extend it first. MVT VT = Op.getSimpleValueType(); if (Sign.getSimpleValueType().bitsLT(VT)) Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); // And if it is bigger, shrink it first. if (Sign.getSimpleValueType().bitsGT(VT)) Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl)); // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. bool IsF128 = (VT == MVT::f128); assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"); MVT EltVT = VT.getScalarType(); const fltSemantics &Sem = EltVT == MVT::f64 ? APFloat::IEEEdouble() : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); // Perform all scalar logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. // TODO: This isn't necessary. If we used scalar types, we might avoid some // unnecessary splats, but we might miss load folding opportunities. Should // this decision be based on OptimizeForSize? bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; // The mask constants are automatically splatted for vector types. unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue SignMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); SDValue MagMask = DAG.getConstantFP( APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). if (IsFakeVector) Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); // Next, clear the sign bit from the first operand (magnitude). // TODO: If we had general constant folding for FP logic ops, this check // wouldn't be necessary. SDValue MagBits; if (ConstantFPSDNode *Op0CN = dyn_cast(Mag)) { APFloat APF = Op0CN->getValueAPF(); APF.clearSign(); MagBits = DAG.getConstantFP(APF, dl, LogicVT); } else { // If the magnitude operand wasn't a constant, we need to AND out the sign. if (IsFakeVector) Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); } // OR the magnitude value with the sign bit. SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT OpVT = N0.getSimpleValueType(); assert((OpVT == MVT::f32 || OpVT == MVT::f64) && "Unexpected type for FGETSIGN"); // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); Res = DAG.getZExtOrTrunc(Res, dl, VT); Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); return Res; } // Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget.hasSSE41()) return SDValue(); if (!Op->hasOneUse()) return SDValue(); SDNode *N = Op.getNode(); SDLoc DL(N); SmallVector Opnds; DenseMap VecInMap; SmallVector VecIns; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. Opnds.push_back(N->getOperand(0)); Opnds.push_back(N->getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl::const_iterator I = Opnds.begin() + Slot; // BFS traverse all OR'd operands. if (I->getOpcode() == ISD::OR) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. e += 2; // 2 more nodes (LHS and RHS) are pushed. continue; } // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa(Idx)) return SDValue(); SDValue ExtractedFromVec = I->getOperand(0); DenseMap::iterator M = VecInMap.find(ExtractedFromVec); if (M == VecInMap.end()) { VT = ExtractedFromVec.getValueType(); // Quit if not 128/256-bit vector. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); // Quit if not the same type. if (VecInMap.begin() != VecInMap.end() && VT != VecInMap.begin()->first.getValueType()) return SDValue(); M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; VecIns.push_back(ExtractedFromVec); } M->second |= 1U << cast(Idx)->getZExtValue(); } assert((VT.is128BitVector() || VT.is256BitVector()) && "Not extracted from 128-/256-bit vector."); unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; for (DenseMap::const_iterator I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { // Quit if not all elements are used. if (I->second != FullMask) return SDValue(); } MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); // If more than one full vector is evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. SDValue LHS = VecIns[Slot]; SDValue RHS = VecIns[Slot + 1]; VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// \brief return true if \c Op has a use that doesn't just read flags. static bool hasNonFlagsUse(SDValue Op) { for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned UOpNo = UI.getOperandNo(); if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { // Look pass truncate. UOpNo = User->use_begin().getOperandNo(); User = *User->use_begin(); } if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) return true; } return false; } // Emit KTEST instruction for bit vectors on AVX-512 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Op.getOpcode() == ISD::BITCAST) { auto hasKTEST = [&](MVT VT) { unsigned SizeInBits = VT.getSizeInBits(); return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); }; SDValue Op0 = Op.getOperand(0); MVT Op0VT = Op0.getValueType().getSimpleVT(); if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && hasKTEST(Op0VT)) return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); } return SDValue(); } /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) { SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, DAG.getConstant(0, dl, MVT::i8)); } // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; bool NeedOF = false; switch (X86CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. switch (Op->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; LLVM_FALLTHROUGH; default: NeedOF = true; break; } break; } } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { // Emit KTEST for bit vectors if (auto Node = EmitKTEST(Op, DAG, Subtarget)) return Node; // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; // Truncate operations may prevent the merge of the SETCC instruction // and the arithmetic instruction before it. Attempt to truncate the operands // of the arithmetic instruction and use a reduced bit-width instruction. bool NeedTruncation = false; SDValue ArithOp = Op; if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { SDValue Arith = Op->getOperand(0); // Both the trunc and the arithmetic op need to have one user each. if (Arith->hasOneUse()) switch (Arith.getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: case ISD::AND: case ISD::OR: case ISD::XOR: { NeedTruncation = true; ArithOp = Arith; } } } // Sometimes flags can be set either with an AND or with an SRL/SHL // instruction. SRL/SHL variant should be preferred for masks longer than this // number of bits. const int ShiftToAndMaxMaskWidth = 32; const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE); // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::ADD: // We only want to rewrite this as a target-specific node with attached // flags if there is a reasonable chance of either using that to do custom // instructions selection that can fold some of the memory operands, or if // only the flags are used. If there are other uses, leave the node alone // and emit a test instruction. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC && UI->getOpcode() != ISD::STORE) goto default_case; if (auto *C = dyn_cast(ArithOp.getOperand(1))) { // An add of one will be selected as an INC. if (C->isOne() && (!Subtarget.slowIncDec() || DAG.getMachineFunction().getFunction().optForSize())) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. if (C->isAllOnesValue() && (!Subtarget.slowIncDec() || DAG.getMachineFunction().getFunction().optForSize())) { Opcode = X86ISD::DEC; NumOperands = 1; break; } } // Otherwise use a regular EFLAGS-setting add. Opcode = X86ISD::ADD; NumOperands = 2; break; case ISD::SHL: case ISD::SRL: // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. if (ZeroCheck && Op->hasOneUse() && isa(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); unsigned ShAmt = Op->getConstantOperandVal(1); if (ShAmt >= BitWidth) // Avoid undefined shifts. break; APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); } break; case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. However, AND should be // preferred if the instruction can be combined into ANDN. if (!hasNonFlagsUse(Op)) { SDValue Op0 = ArithOp->getOperand(0); SDValue Op1 = ArithOp->getOperand(1); EVT VT = ArithOp.getValueType(); bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI(); // If we cannot select an ANDN instruction, check if we can replace // AND+IMM64 with a shift before giving up. This is possible for masks // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag. if (!isProperAndn) { if (!ZeroCheck) break; assert(!isa(Op0) && "AND node isn't canonicalized"); auto *CN = dyn_cast(Op1); if (!CN) break; const APInt &Mask = CN->getAPIntValue(); if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; // Prefer TEST instruction. unsigned BitWidth = Mask.getBitWidth(); unsigned LeadingOnes = Mask.countLeadingOnes(); unsigned TrailingZeros = Mask.countTrailingZeros(); if (LeadingOnes + TrailingZeros == BitWidth) { assert(TrailingZeros < VT.getSizeInBits() && "Shift amount should be less than the type width"); MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); break; } unsigned LeadingZeros = Mask.countLeadingZeros(); unsigned TrailingOnes = Mask.countTrailingOnes(); if (LeadingZeros + TrailingOnes == BitWidth) { assert(LeadingZeros < VT.getSizeInBits() && "Shift amount should be less than the type width"); MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); break; } break; } } LLVM_FALLTHROUGH; case ISD::SUB: case ISD::OR: case ISD::XOR: // Similar to ISD::ADD above, check if the uses will preclude useful // lowering of the target-specific node. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC && UI->getOpcode() != ISD::STORE) goto default_case; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { if (!NeedTruncation && ZeroCheck) { if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) return EFLAGS; } Opcode = X86ISD::OR; break; } } NumOperands = 2; break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::INC: case X86ISD::DEC: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); default: default_case: break; } // If we found that truncation is beneficial, perform the truncation and // update 'Op'. if (NeedTruncation) { EVT VT = Op.getValueType(); SDValue WideVal = Op->getOperand(0); EVT WideVT = WideVal.getValueType(); unsigned ConvertedOp = 0; // Use a target machine opcode to prevent further DAGCombine // optimizations that may separate the arithmetic operations // from the setcc node. switch (WideVal.getOpcode()) { default: break; case ISD::ADD: ConvertedOp = X86ISD::ADD; break; case ISD::SUB: ConvertedOp = X86ISD::SUB; break; case ISD::AND: ConvertedOp = X86ISD::AND; break; case ISD::OR: ConvertedOp = X86ISD::OR; break; case ISD::XOR: ConvertedOp = X86ISD::XOR; break; } if (ConvertedOp) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); } } } if (Opcode == 0) { // Emit KTEST for bit vectors if (auto Node = EmitKTEST(Op, DAG, Subtarget)) return Node; // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const { if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG); assert(!(isa(Op1) && Op0.getValueType() == MVT::i1) && "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { // Only promote the compare up to I32 if it is a 16 bit operation // with an immediate. 16 bit immediates are to be avoided. if ((Op0.getValueType() == MVT::i16 && (isa(Op0) || isa(Op1))) && !DAG.getMachineFunction().getFunction().optForMinSize() && !Subtarget.isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); } // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return SDValue(Sub.getNode(), 1); } return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } /// Convert a comparison if required by the subtarget. SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. if (Subtarget.hasCMov() || Cmp.getOpcode() != X86ISD::CMP || !Cmp.getOperand(0).getValueType().isFloatingPoint() || !Cmp.getOperand(1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); // Some 64-bit targets lack SAHF support, but they do support FCOMI. assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } /// Check if replacement of SQRT with RSQRT should be disabled. bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // We never want to use both SQRT and RSQRT instructions for the same input. if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; if (VT.isVector()) return Subtarget.hasFastVectorFSQRT(); return Subtarget.hasFastScalarFSQRT(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { EVT VT = Op.getValueType(); // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 // after legalize types. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || (VT == MVT::v8f32 && Subtarget.hasAVX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; UseOneConstNR = false; return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); } return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { EVT VT = Op.getValueType(); // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // reciprocal estimate with refinement on x86 prior to FMA requires // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1()) || (VT == MVT::v8f32 && Subtarget.hasAVX())) { // Enable estimate codegen with 1 refinement step for vector division. // Scalar division estimates are disabled because they break too much // real-world code. These defaults are intended to match GCC behavior. if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) return SDValue(); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); } return SDValue(); } /// If we have at least two divisions that use the same divisor, convert to /// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } /// Helper for creating a X86ISD::SETCC node. static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); } /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition /// according to equal/not-equal condition code \p CC. static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { // If Src is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because // the encoding for the i16 version is larger than the i32 version. // Also promote i16 to i32 for performance / code size reason. if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16) Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src); // See if we can use the 32-bit instruction instead of the 64-bit one for a // shorter encoding. Since the former takes the modulo 32 of BitNo and the // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is // known to be zero. if (Src.getValueType() == MVT::i64 && DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); // If the operand types disagree, extend the shift amount to match. Since // BT ignores high bits (like shifts) we can use anyextend. if (Src.getValueType() != BitNo.getValueType()) BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; return getSETCC(Cond, BT, dl , DAG); } /// Result of 'and' is compared against zero. Change to a BT node if possible. static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { assert(And.getOpcode() == ISD::AND && "Expected AND node!"); SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) Op0 = Op0.getOperand(0); if (Op1.getOpcode() == ISD::TRUNCATE) Op1 = Op1.getOperand(0); SDValue LHS, RHS; if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { KnownBits Known; DAG.computeKnownBits(Op0, Known); if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) return SDValue(); } LHS = Op1; RHS = Op0.getOperand(1); } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast(Op1); uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); } // Use BT if the immediate can't be encoded in a TEST instruction. if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { LHS = AndLHS; RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType()); } } if (LHS.getNode()) return getBitTestCondition(LHS, RHS, CC, dl, DAG); return SDValue(); } /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask /// CMPs. static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1) { unsigned SSECC; bool Swap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; case ISD::SETOGE: case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLE: case ISD::SETOLE: SSECC = 2; break; case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: SSECC = 5; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: SSECC = 8; break; case ISD::SETONE: SSECC = 12; break; } if (Swap) std::swap(Op0, Op1); return SSECC; } /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then /// concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); SDValue CC = Op.getOperand(2); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected type for boolean compare operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, DAG.getConstant(-1, dl, VT)); SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1, DAG.getConstant(-1, dl, VT)); switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETEQ: // (x == y) -> ~(x ^ y) return DAG.getNode(ISD::XOR, dl, VT, DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), DAG.getConstant(-1, dl, VT)); case ISD::SETNE: // (x != y) -> (x ^ y) return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); case ISD::SETUGT: case ISD::SETGT: // (x > y) -> (x & ~y) return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1); case ISD::SETULT: case ISD::SETLT: // (x < y) -> (~x & y) return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1); case ISD::SETULE: case ISD::SETLE: // (x <= y) -> (~x | y) return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1); case ISD::SETUGE: case ISD::SETGE: // (x >=y) -> (x | ~y) return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1); } } static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(VT.getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); unsigned Opc = 0; bool Unsigned = false; bool Swap = false; unsigned SSECC; switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: SSECC = 4; break; case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; case ISD::SETUGT: SSECC = 6; Unsigned = true; break; case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; case ISD::SETULT: SSECC = 1; Unsigned = true; break; case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH; case ISD::SETLE: SSECC = 2; break; } if (Swap) std::swap(Op0, Op1); // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM. if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) { SDValue A = peekThroughBitcasts(Op0); if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) && ISD::isBuildVectorAllZeros(Op1.getNode())) { MVT VT0 = Op0.getSimpleValueType(); SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0)); SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1)); return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM, dl, VT, RHS, LHS); } } if (Opc) return DAG.getNode(Opc, dl, VT, Op0, Op1); Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, dl, MVT::i8)); } /// \brief Try to turn a VSETULT into a VSETULE by modifying its second /// operand \p Op1. If non-trivial (for example because it's not constant) /// return an empty value. static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1, SelectionDAG &DAG) { BuildVectorSDNode *BV = dyn_cast(Op1.getNode()); if (!BV) return SDValue(); MVT VT = Op1.getSimpleValueType(); MVT EVT = VT.getVectorElementType(); unsigned n = VT.getVectorNumElements(); SmallVector ULTOp1; for (unsigned i = 0; i < n; ++i) { ConstantSDNode *Elt = dyn_cast(BV->getOperand(i)); if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) return SDValue(); // Avoid underflow. APInt Val = Elt->getAPIntValue(); if (Val == 0) return SDValue(); ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); } return DAG.getBuildVector(VT, dl, ULTOp1); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); ISD::CondCode Cond = cast(CC)->get(); bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); if (isFP) { #ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif unsigned Opc; if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16); Opc = X86ISD::CMPM; } else { Opc = X86ISD::CMPP; // The SSE/AVX packed FP comparison nodes are defined with a // floating-point vector result that matches the operand type. This allows // them to work with an SSE1 target (integer vector types are not legal). VT = Op0.getSimpleValueType(); } // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), // emit two comparisons and a logic op to tie them together. SDValue Cmp; unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); if (SSECC >= 8 && !Subtarget.hasAVX()) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; if (Cond == ISD::SETUEQ) { CC0 = 3; // UNORD CC1 = 0; // EQ CombineOpc = X86ISD::FOR; } else { assert(Cond == ISD::SETONE); CC0 = 7; // ORD CC1 = 4; // NEQ CombineOpc = X86ISD::FAND; } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC1, dl, MVT::i8)); Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { // Handle all other FP comparisons here. Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the // result type of SETCC. The bitcast is expected to be optimized away // during combining/isel. if (Opc == X86ISD::CMPP) Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); return Cmp; } MVT VTOp0 = Op0.getSimpleValueType(); assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); if (VT.is128BitVector() && VTOp0.is256BitVector()) { // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the // legalizer firstly checks if the first operand in input to the setcc has // a legal type. If so, then it promotes the return type to that same type. // Otherwise, the return type is promoted to the 'next legal type' which, // for a vector of MVT::i1 is always a 128-bit integer vector type. // // We reach this code only if the following two conditions are met: // 1. Both return type and operand type have been promoted to wider types // by the type legalizer. // 2. The original operand type has been promoted to a 256-bit vector. // // Note that condition 2. only applies for AVX targets. SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond); return DAG.getZExtOrTrunc(NewOp, dl, VT); } // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntVSETCC(Op, DAG); // Operands are boolean (vectors of i1) MVT OpVT = Op1.getSimpleValueType(); if (OpVT.getVectorElementType() == MVT::i1) return LowerBoolVSETCC_AVX512(Op, DAG); // The result is boolean, but operands are int/float if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements in KNL. // In this case use SSE compare bool UseAVX512Inst = (OpVT.is512BitVector() || OpVT.getScalarSizeInBits() >= 32 || (Subtarget.hasBWI() && Subtarget.hasVLX())); if (UseAVX512Inst) return LowerIntVSETCC_AVX512(Op, DAG); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } // Lower using XOP integer comparisons. if ((VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; switch (Cond) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; case ISD::SETULE: case ISD::SETLE: CmpMode = 0x01; break; case ISD::SETUGT: case ISD::SETGT: CmpMode = 0x02; break; case ISD::SETUGE: case ISD::SETGE: CmpMode = 0x03; break; case ISD::SETEQ: CmpMode = 0x04; break; case ISD::SETNE: CmpMode = 0x05; break; } // Are we comparing unsigned or signed integers? unsigned Opc = ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CmpMode, dl, MVT::i8)); } // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. // Revert part of the simplifySetCCWithAnd combine, to avoid an invert. if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) { SDValue BC0 = peekThroughBitcasts(Op0); if (BC0.getOpcode() == ISD::AND) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits, false, false)) { if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) { Cond = ISD::SETEQ; Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); } } } } // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ : X86ISD::PCMPGT; bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE || Cond == ISD::SETUGE; bool Invert = Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); // If both operands are known non-negative, then an unsigned compare is the // same as a signed compare and there's no need to flip signbits. // TODO: We could check for more general simplifications here since we're // computing known bits. bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); bool HasMinMax = (Subtarget.hasAVX512() && VET == MVT::i64) || (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) || (Subtarget.hasSSE2() && (VET == MVT::i8)); bool MinMax = false; if (HasMinMax) { switch (Cond) { default: break; case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; } if (MinMax) Swap = Invert = FlipSigns = false; } bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); bool Subus = false; if (!MinMax && HasSubus) { // As another special case, use PSUBUS[BW] when it's profitable. E.g. for // Op0 u<= Op1: // t = psubus Op0, Op1 // pcmpeq t, <0..0> switch (Cond) { default: break; case ISD::SETULT: { // If the comparison is against a constant we can turn this into a // setule. With psubus, setule does not require a swap. This is // beneficial because the constant in the register is no longer // destructed as the destination so it can be hoisted out of a loop. // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) break; if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) { Op1 = ULEOp1; Subus = true; Invert = false; Swap = false; } break; } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; } if (Subus) { Opc = X86ISD::SUBUS; FlipSigns = false; } } if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { assert(Subtarget.hasSSE2() && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. SDValue SB; if (FlipSigns) { SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32); } else { SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero}); } Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. static const int MaskHi[] = { 1, 1, 3, 3 }; static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Do the compare. SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); if (MinMax) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); if (Subus) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, getZeroVector(VT, Subtarget, DAG, dl)); return Result; } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); ISD::CondCode CC = cast(Op.getOperand(2))->get(); // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG)) return NewSetCC; } // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; CCode = X86::GetOppositeBranchCondition(CCode); return getSETCC(CCode, Op0.getOperand(1), dl, DAG); } } bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); return getSETCC(X86CC, EFLAGS, dl, DAG); } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast(Cond)->get()); // Recreate the carry if needed. EVT CarryVT = Carry.getValueType(); APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getConstant(NegOne, DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); return getSETCC(CC, Cmp.getValue(1), DL, DAG); } /// Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) return true; return false; } static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. if (Cond.getOpcode() == ISD::SETCC && ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); unsigned SSECC = translateX86FSETCC( cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. // If either operand is a constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. if (Subtarget.hasAVX() && !isa(Op1) && !isa(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } // AVX512 fallback is to lower selects of scalar floats to masked moves. if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) Op1Scalar = Op1.getOperand(0); SDValue Op2Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, DAG.getIntPtrConstant(0, DL)); } } if (VT == MVT::v4i1 || VT == MVT::v2i1) { SDValue zeroConst = DAG.getIntPtrConstant(0, DL); Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } if (Cond.getOpcode() == ISD::SETCC) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the // select were also updated (for example, EmitTest has a RAUW). Refresh // the local references to the select operands in case they got stale. Op1 = Op.getOperand(1); Op2 = Op.getOperand(2); } } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode = cast(Cond.getOperand(0))->getZExtValue(); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0); SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); return Res; } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { SDValue CmpOp0 = Cmp.getOperand(0); SDValue Src1, Src2; // true if Op2 is XOR or OR operator and one of its operands // is equal to Op1 // ( a , a op b) || ( b , a op b) auto isOrXorPattern = [&]() { if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { Src1 = Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); Src2 = Op1; return true; } return false; }; if (isOrXorPattern()) { SDValue Neg; unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); // we need mask of all zeros or ones with same size of the other // operands. if (CmpSz > VT.getSizeInBits()) Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); else if (CmpSz < VT.getSizeInBits()) Neg = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), DAG.getConstant(1, DL, VT)); else Neg = CmpOp0; SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Neg); // -(and (x, 0x1)) SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y } } } // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && Cond.getOperand(0).getValueType() != MVT::i8)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); } if (CondOpcode == ISD::UMULO) VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); else VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); if (CondOpcode == ISD::UMULO) Cond = X86Op.getValue(2); else Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, DL, MVT::i8); AddTest = false; } if (AddTest) { // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); AddTest = false; } } } if (AddTest) { CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } } // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Blacklist CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDValue Ops[] = { Op2, Op1, CC, Cond }; return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops); } static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); MVT VTElt = VT.getVectorElementType(); SDLoc dl(Op); unsigned NumElts = VT.getVectorNumElements(); // Extend VT if the scalar type is v8/v16 and BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) ExtVT = MVT::getVectorVT(MVT::i32, NumElts); // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, dl)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue V; MVT WideEltVT = WideVT.getVectorElementType(); if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) || (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG); } else { SDValue NegOne = getOnesVector(WideVT, DAG, dl); SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl); V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); } // Truncate if we had to extend i16/i8 above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(VTElt, NumElts); V = DAG.getNode(X86ISD::VTRUNC, dl, WideVT, V); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, DAG.getIntPtrConstant(0, dl)); return V; } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); if (Subtarget.hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; return SDValue(); } // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. // For sign extend this needs to handle all vector sizes and SSE4.1 and // non-SSE4.1 targets. For zero extend this should only handle inputs of // MVT::v64i8 when BWI is not supported, but AVX512 is. static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); assert(VT.getSizeInBits() == InVT.getSizeInBits()); MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); assert(SVT.getSizeInBits() > InSVT.getSizeInBits()); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && !(VT.is256BitVector() && Subtarget.hasInt256()) && !(VT.is512BitVector() && Subtarget.hasAVX512())) return SDValue(); SDLoc dl(Op); // For 256-bit vectors, we only need the lower (128-bit) half of the input. // For 512-bit vectors, we need 128-bits or 256-bits. if (VT.getSizeInBits() > 128) { // Input needs to be at least the same number of elements as output, and // at least 128-bits. int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements(); In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); } assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"); // SSE41 targets can use the pmovsx* instructions directly for 128-bit results, // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still // need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT; return DAG.getNode(ExtOpc, dl, VT, In); } // We should only get here for sign extend. assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!"); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; MVT CurrVT = InVT; // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); Curr = DAG.getBitcast(CurrVT, Curr); } SDValue SignExt = Curr; if (CurrVT != InVT) { unsigned SignExtShift = CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } if (CurrVT == VT) return SignExt; if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(31, dl, MVT::i8)); SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); return DAG.getBitcast(VT, Ext); } return SDValue(); } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && (VT != MVT::v16i16 || InVT != MVT::v16i8) && (VT != MVT::v8i64 || InVT != MVT::v8i32) && (VT != MVT::v8i64 || InVT != MVT::v8i16) && (VT != MVT::v16i32 || InVT != MVT::v16i16) && (VT != MVT::v16i32 || InVT != MVT::v16i8) && (VT != MVT::v32i16 || InVT != MVT::v32i8)) return SDValue(); if (Subtarget.hasInt256()) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and // v4i32 to v4i64 // // Divide input vector into two parts // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT unsigned NumElems = InVT.getVectorNumElements(); SDValue Undef = DAG.getUNDEF(InVT); SmallVector ShufMask1(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask1[i] = i; SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1); SmallVector ShufMask2(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask2[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2); MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT); OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } // Lower truncating store. We need a special lowering to vXi1 vectors static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast(StOp.getNode()); SDLoc dl(St); EVT MemVT = St->getMemoryVT(); assert(St->isTruncatingStore() && "We only custom truncating store."); assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 && "Expected truncstore of i1 vector"); SDValue Op = St->getValue(); MVT OpVT = Op.getValueType().getSimpleVT(); unsigned NumElts = OpVT.getVectorNumElements(); if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || NumElts == 16) { // Truncate and store - everything is legal Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op); if (MemVT.getSizeInBits() < 8) Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), St->getMemOperand()); } // A subset, assume that we have only AVX-512F if (NumElts <= 8) { if (NumElts < 8) { // Extend to 8-elts vector MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8); Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT, DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl)); } Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op); return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), St->getMemOperand()); } // v32i8 assert(OpVT == MVT::v32i8 && "Unexpected operand type"); // Divide the vector into 2 parts and store each part separately SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, DAG.getIntPtrConstant(0, dl)); Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo); SDValue BasePtr = St->getBasePtr(); SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr, St->getMemOperand()); SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, DAG.getIntPtrConstant(16, dl)); Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi); SDValue BasePtrHi = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(2, dl, BasePtr.getValueType())); SDValue StHi = DAG.getStore(St->getChain(), dl, Hi, BasePtrHi, St->getMemOperand()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi); } static SDValue LowerExtended1BitVectorLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 && "Expected i1 vector load"); unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; MVT VT = Op.getValueType().getSimpleVT(); unsigned NumElts = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && NumElts >= 32) || (Subtarget.hasDQI() && NumElts < 16) || NumElts == 16) { // Load and extend - everything is legal if (NumElts < 8) { SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); // Finally, do a normal sign-extend to the desired register. return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load); } if (NumElts <= 8) { // A subset, assume that we have only AVX-512F unsigned NumBitsToLoad = 8; MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad); SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad); SDValue BitVec = DAG.getBitcast(MaskVT, Load); if (NumElts == 8) return DAG.getNode(ExtOpcode, dl, VT, BitVec); // we should take care to v4i1 and v2i1 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } assert(VT == MVT::v32i8 && "Unexpected extload type"); SDValue BasePtr = Ld->getBasePtr(); SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); SDValue BasePtrHi = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getConstant(2, dl, BasePtr.getValueType())); SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi, Ld->getMemOperand()); SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadLo.getValue(1), LoadHi.getValue(1)); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo); SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi); } // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector sext loads."); assert(RegVT.isInteger() && "We only custom lower integer vector sext loads."); // Nothing useful we can do without SSE2 shuffles. assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); if (MemVT.getScalarType() == MVT::i1) return LowerExtended1BitVectorLoad(Op, Subtarget, DAG); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); ISD::LoadExtType Ext = Ld->getExtensionType(); assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) && "Only anyext and sext are currently implemented."); assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { // The only way in which we have a legal 256-bit vector result but not the // integer 256-bit operations needed to directly lower a sextload is if we // have AVX1 but not AVX2. In that case, we can always emit a sextload to // a 128-bit vector and a normal sign_extend to 256-bits that should get // correctly legalized. We do this late to allow the canonical form of // sextload to persist throughout the rest of the DAG combiner -- it wants // to fold together any extensions it can, and so will fuse a sign_extend // of an sextload into a sextload targeting a wider value. SDValue Load; if (MemSz == 128) { // Just switch this to a normal load. assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " "it must be a legal 128-bit vector " "type!"); Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); } else { assert(MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"); // Do an sext load to a 128-bit vector type. We want to use the same // number of elements, but elements half as wide. This will end up being // recursively lowered by this routine, but will succeed as we definitely // have all the necessary features if we're using AVX1. EVT HalfEltVT = EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); Load = DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT, Ld->getAlignment(), Ld->getMemOperand()->getFlags()); } // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); // Finally, do a normal sign-extend to the desired register. return DAG.getSExtOrTrunc(Load, dl, RegVT); } // All sizes must be a power of two. assert(isPowerOf2_32(RegSz * MemSz * NumElems) && "Non-power-of-two elements are not custom lowered!"); // Attempt to load the original value using scalar loads. // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; } } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && (64 <= MemSz)) SclrLoadTy = MVT::f64; // Calculate the number of scalar loads that we need to perform // in order to load our vector from memory. unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && "Can only lower sext loads with a single scalar load!"); unsigned loadRegZize = RegSz; if (Ext == ISD::SEXTLOAD && RegSz >= 256) loadRegZize = 128; // If we don't have BWI we won't be able to create the shuffle needed for // v8i8->v8i64. if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) loadRegZize = 128; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT( *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegZize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); // We can't shuffle using an illegal type. assert(TLI.isTypeLegal(WideVecVT) && "We only lower types that form legal widened vector types"); SmallVector Chains; SDValue Ptr = Ld->getBasePtr(); SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Res = DAG.getUNDEF(LoadUnitVecVT); for (unsigned i = 0; i < NumLoads; ++i) { // Perform a single load. SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); Chains.push_back(ScalarLoad.getValue(1)); // Create the first element type using SCALAR_TO_VECTOR in order to avoid // another round of DAGCombining. if (i == 0) Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); else Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, ScalarLoad, DAG.getIntPtrConstant(i, dl)); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. if (Subtarget.hasSSE41()) { SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest // lanes. assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } // Redistribute the loaded elements into the different locations. SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i * SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), ShuffleVec); // Bitcast to the requested type. Shuff = DAG.getBitcast(RegVT, Shuff); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes /// each of which has no other use apart from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) return false; return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse() && Op.getOperand(1).getOpcode() == X86ISD::SETCC && Op.getOperand(1).hasOneUse()); } /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the /// SETCC node has a single use. static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse(); return false; } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); SDValue CC; bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast(Cond.getOperand(2))->get() == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || Cond.getOperand(0).getOpcode() == ISD::SSUBO || Cond.getOperand(0).getOpcode() == ISD::USUBO || Cond.getOperand(0).getOpcode() == ISD::SMULO || Cond.getOperand(0).getOpcode() == ISD::UMULO)) { Inverted = true; Cond = Cond.getOperand(0); } else { if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; } } #if 0 // FIXME: LowerXALUO doesn't handle these!! else if (Cond.getOpcode() == X86ISD::ADD || Cond.getOpcode() == X86ISD::SUB || Cond.getOpcode() == X86ISD::SMUL || Cond.getOpcode() == X86ISD::UMUL) Cond = LowerXALUO(Cond, DAG); #endif // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { Cond = Cmp; addTest = false; } else { switch (cast(CC)->getZExtValue()) { default: break; case X86::COND_O: case X86::COND_B: // These can only come from an arithmetic instruction with overflow, // e.g. SADDO, UADDO. Cond = Cond.getOperand(1); addTest = false; break; } } } CondOpcode = Cond.getOpcode(); if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && Cond.getOperand(0).getValueType() != MVT::i8)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; // Keep this in sync with LowerXALUO, otherwise we might create redundant // instructions that can't be removed afterwards (i.e. X86ISD::ADD and // X86ISD::INC). switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: if (isOneConstant(RHS)) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: if (isOneConstant(RHS)) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); } if (Inverted) X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); if (CondOpcode == ISD::UMULO) VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); else VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); if (CondOpcode == ISD::UMULO) Cond = X86Op.getValue(2); else Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { SDValue Cmp = Cond.getOperand(0).getOperand(1); if (CondOpc == ISD::OR) { // Also, recognize the pattern generated by an FCMP_UNE. We can emit // two branches instead of an explicit OR instruction with a // separate test. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp)) { CC = Cond.getOperand(0).getOperand(0); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = Cond.getOperand(1).getOperand(0); Cond = Cmp; addTest = false; } } else { // ISD::AND // Also, recognize the pattern generated by an FCMP_OEQ. We can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); X86::CondCode CCode = (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cmp; addTest = false; } } } } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. // It should be transformed during dag combiner except when the condition // is set by a arithmetics with overflow node. X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_UNE. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8); Cond = Cmp; addTest = false; Dest = FalseBB; } } } } if (addTest) { // Look pass the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; } } } if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getConstant(X86Cond, dl, MVT::i8); Cond = EmitTest(Cond, X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || SplitStack || EmitStackProbe; SDLoc dl(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); unsigned Align = cast(Op.getOperand(2))->getZExtValue(); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. const Function &F = MF.getFunction(); for (const auto &A : F.args()) { if (A.hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); } } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); MF.getInfo()->setHasWinAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Align) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } Result = SP; } Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); SDValue Ops[2] = {Result, Chain}; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); if (!Subtarget.is64Bit() || Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } // __va_list_tag: // gp_offset (0 - 6 * 8) // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). // reg_save_area SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV)); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getMemBasePlusOffset(FIN, 4, DL); Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV, 4)); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore( Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12)); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) // The Win64 ABI uses char* instead of a structure. return DAG.expandVAArg(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. if (ArgVT == MVT::f80) { llvm_unreachable("va_arg for f80 not yet implemented"); } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { llvm_unreachable("Unhandled argument type in LowerVAARG"); } if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget.useSoftFloat() && !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), DAG.getConstant(ArgMode, dl, MVT::i8), DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); } static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, // where a va_list is still an i8*. assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget.isCallingConvWin64( DAG.getMachineFunction().getFunction().getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } /// Handle vector element shifts where the shift amount is a constant. /// Takes immediate version of shift as input. static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); // Bitcast the source vector to the output type, this is mainly necessary for // vXi8/vXi64 shifts. if (VT != SrcOp.getSimpleValueType()) SrcOp = DAG.getBitcast(VT, SrcOp); // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) ShiftAmt = ElementType.getSizeInBits() - 1; else return DAG.getConstant(0, dl, VT); } assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs. if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector Elts; unsigned NumElts = SrcOp->getNumOperands(); ConstantSDNode *ND; switch(Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } break; } return DAG.getBuildVector(VT, dl, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, dl, MVT::i8)); } /// Handle vector element shifts where the shift amount may or may not be a /// constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. if (ConstantSDNode *CShAmt = dyn_cast(ShAmt)) return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, CShAmt->getZExtValue(), DAG); // Change opcode to non-immediate version switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } // Need to build a vector containing shift amount. // SSE/AVX packed shifts only use the lower 64-bit of the shift count. // +=================+============+=======================================+ // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | // +=================+============+=======================================+ // | i64 | Yes, No | Use ShAmt as lowest elt | // | i32 | Yes | zero-extend in-reg | // | (i32 zext(i16)) | Yes | zero-extend in-reg | // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | // +=================+============+=======================================+ if (SVT == MVT::i64) ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { ShAmt = ShAmt.getOperand(0); ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } /// \brief Return Mask with the necessary casting or extending /// for \p Mask according to \p MaskVT when lowering masking intrinsics static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { if (isAllOnesConstant(Mask)) return DAG.getConstant(1, dl, MaskVT); if (X86::isZeroNode(Mask)) return DAG.getConstant(0, dl, MaskVT); if (MaskVT.bitsGT(Mask.getSimpleValueType())) { // Mask should be extended Mask = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); } if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) { if (MaskVT == MVT::v64i1) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); // In case 32bit mode, bitcast i64 is illegal, extend/split it. SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(0, dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(1, dl, MVT::i32)); Lo = DAG.getBitcast(MVT::v32i1, Lo); Hi = DAG.getBitcast(MVT::v32i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } else { // MaskVT require < 64bit. Truncate mask (should succeed in any case), // and bitcast. MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); return DAG.getBitcast(MaskVT, DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); } } else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); } } /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); unsigned OpcodeSelect = ISD::VSELECT; SDLoc dl(Op); if (isAllOnesConstant(Mask)) return Op; SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); switch (Op.getOpcode()) { default: break; case X86ISD::CMPM: case X86ISD::CMPM_RND: case X86ISD::CMPMU: case X86ISD::VPSHUFBITQMB: return DAG.getNode(ISD::AND, dl, VT, Op, VMask); case X86ISD::VFPCLASS: return DAG.getNode(ISD::OR, dl, VT, Op, VMask); case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: case X86ISD::CVTPS2PH: // We can't use ISD::VSELECT here because it is not always "Legal" // for the destination type. For example vpmovqb require only AVX512 // and vselect that can operate on byte element type require BWI OpcodeSelect = X86ISD::SELECT; break; } if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). /// The mask is coming as MVT::i8 and it should be transformed /// to MVT::v1i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (auto *MaskConst = dyn_cast(Mask)) if (MaskConst->getZExtValue() & 0x1) return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); if (Op.getOpcode() == X86ISD::FSETCCM || Op.getOpcode() == X86ISD::FSETCCM_RND) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { if (!Fn->hasPersonalityFn()) report_fatal_error( "querying registration node size for function without personality"); // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See // WinEHStatePass for the full struct definition. switch (classifyEHPersonality(Fn->getPersonalityFn())) { case EHPersonality::MSVC_X86SEH: return 24; case EHPersonality::MSVC_CXX: return 16; default: break; } report_fatal_error( "can only recover FP for 32-bit MSVC EH personality functions"); } /// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize /// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP) { MachineFunction &MF = DAG.getMachineFunction(); SDLoc dl; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); // It's possible that the parent function no longer has a personality function // if the exceptional code was optimized away, in which case we just return // the incoming EBP. if (!Fn->hasPersonalityFn()) return EntryEBP; // Get an MCSymbol that will ultimately resolve to the frame offset of the EH // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after // prologue to RBP in the parent function. const X86Subtarget &Subtarget = static_cast(DAG.getSubtarget()); if (Subtarget.is64Bit()) return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { if (!isa(Rnd)) return false; unsigned Round = cast(Rnd)->getZExtValue(); return Round == X86::STATIC_ROUNDING::CUR_DIRECTION; }; SDLoc dl(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case INTR_TYPE_1OP_MASK_RM: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; // We always add rounding mode to the Node. // If the rounding mode is not specified, we add the // "current direction" mode. if (Op.getNumOperands() == 4) RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); assert(IntrData->Opc1 == 0 && "Unexpected second opcode!"); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RM Opcode is specified and // - RM is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, Rnd), Mask, PassThru, Subtarget, DAG); } } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. bool HasRounding = IntrWithRoundingModeOpcode != 0; if (Op.getNumOperands() == (5U + HasRounding)) { if (HasRounding) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, Rnd), Mask, passThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } assert(Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); if (HasRounding) { SDValue Sae = Op.getOperand(6); if (!isRoundModeCurDirection(Sae)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, RoundingMode, Sae), Mask, passThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Sae), Mask, Src0, Subtarget, DAG); } assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); SDValue Sae = Op.getOperand(6); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: case INTR_TYPE_2OP_IMM8_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } } // TODO: Intrinsics should have fast-math-flags to propagate. return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (6 operands), // if not, we set rounding mode to "current". SDValue Rnd; if (Op.getNumOperands() == 6) Rnd = Op.getOperand(5); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(6); if (!isRoundModeCurDirection(Rnd)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Imm = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (7 operands), // if not, we set rounding mode to "current". SDValue Rnd; if (Op.getNumOperands() == 7) Rnd = Op.getOperand(6); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Imm, Rnd), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_IMM8_MASK: case INTR_TYPE_3OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(6); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case VPERM_2OP_MASK : { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // Swap Src1 and Src2 in the node creation return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1), Mask, PassThru, Subtarget, DAG); } case VPERM_3OP_MASKZ: case VPERM_3OP_MASK:{ MVT VT = Op.getSimpleValueType(); // Src2 is the PassThru SDValue Src1 = Op.getOperand(1); // PassThru needs to be the same type as the destination in order // to pattern match correctly. SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2)); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == VPERM_3OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else PassThru = Src2; // Swap Src1 and Src2 in the node creation return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src2, Src1, Src3), Mask, PassThru, Subtarget, DAG); } case FMA_OP_MASK3: case FMA_OP_MASKZ: case FMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == FMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_MASK3) PassThru = Src3; else PassThru = Src1; // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case FMA_OP_SCALAR_MASK: case FMA_OP_SCALAR_MASK3: case FMA_OP_SCALAR_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == FMA_OP_SCALAR_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_SCALAR_MASK3) PassThru = Src3; else PassThru = Src1; unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case IFMA_OP_MASKZ: case IFMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = Src1; // set PassThru element if (IntrData->Type == IFMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); // Node we need to swizzle the operands to pass the multiply operands // first. return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src2, Src3, Src1), Mask, PassThru, Subtarget, DAG); } case TERLOG_OP_MASK: case TERLOG_OP_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); SDValue Mask = Op.getOperand(5); MVT VT = Op.getSimpleValueType(); SDValue PassThru = Src1; // Set PassThru element. if (IntrData->Type == TERLOG_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Src4), Mask, PassThru, Subtarget, DAG); } case CVTPD2PS: // ISD::FP_ROUND has a second argument that indicates if the truncation // does not change the value. Set it to 0 since it can change. return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), DAG.getIntPtrConstant(0, dl)); case CVTPD2PS_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RM Opcode is specified and // - RM is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, Rnd), Mask, PassThru, Subtarget, DAG); } } assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!"); // ISD::FP_ROUND has a second argument that indicates if the truncation // does not change the value. Set it to 0 since it can change. return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, DAG.getIntPtrConstant(0, dl)), Mask, PassThru, Subtarget, DAG); } case FPCLASS: { // FPclass intrinsics with mask SDValue Src1 = Op.getOperand(1); MVT VT = Src1.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), FPclassMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask, DAG.getIntPtrConstant(0, dl)); } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. // Example of transformation: // (i8 (int_x86_avx512_mask_pcmpeq_q_128 // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> // (i8 (bitcast // (v8i1 (insert_subvector undef, // (v2i1 (and (PCMPEQM %a, %b), // (extract_subvector // (v8i1 (bitcast %mask)), 0))), 0)))) MVT VT = Op.getOperand(1).getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { SDValue CC = Op.getOperand(3); CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC, Rnd); } //default rounding mode if(!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC); } else { assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2)); } SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CmpMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); SDValue Mask = Op.getOperand(4); SDValue Cmp; if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask, DAG.getIntPtrConstant(0, dl)); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); SDValue SetCC; switch (CC) { case ISD::SETEQ: { // (ZF = 0 and PF = 0) SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); break; } case ISD::SETNE: { // (ZF = 1 or PF = 1) SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); break; } case ISD::SETGT: // (CF = 0 and ZF = 0) SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG); break; } case ISD::SETGE: // CF = 0 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); break; case ISD::SETLE: // The condition is opposite to GE. Swap the operands. SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG); break; default: llvm_unreachable("Unexpected illegal condition!"); } return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned CondVal = cast(Op.getOperand(3))->getZExtValue(); SDValue Sae = Op.getOperand(4); SDValue FCmp; if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8)); else FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp, DAG.getIntPtrConstant(0, dl)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), Subtarget, DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } case BROADCASTM: { SDValue Mask = Op.getOperand(1); MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } case MASK_BINOP: { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2); return DAG.getBitcast(VT, Res); } case FIXUPIMMS: case FIXUPIMMS_MASKZ: case FIXUPIMM: case FIXUPIMM_MASKZ:{ SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Imm = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ? Src1 : getZeroVector(VT, Subtarget, DAG, dl); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (7 operands), // if not, we set rounding mode to "current". SDValue Rnd; if (Op.getNumOperands() == 7) Rnd = Op.getOperand(6); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Imm, Rnd), Mask, Passthru, Subtarget, DAG); else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Imm, Rnd), Mask, Passthru, Subtarget, DAG); } case CONVERT_TO_MASK: { MVT SrcVT = Op.getOperand(1).getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1)); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CvtMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case ROUNDP: { assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, Op.getOperand(2), DAG.getConstant(0xf, dl, MVT::i32)); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), RoundingMode); } case ROUNDS: { assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, Op.getOperand(3), DAG.getConstant(0xf, dl, MVT::i32)); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), RoundingMode); } default: break; } } switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, // but second operand for node/instruction. return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1)); // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestz_256: case Intrinsic::x86_avx_ptestc_256: case Intrinsic::x86_avx_ptestnzc_256: case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { bool IsTestPacked = false; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: IsTestPacked = true; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: IsTestPacked = true; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: IsTestPacked = true; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; } SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_avx512_kortestz_w: case Intrinsic::x86_avx512_kortestc_w: { X86::CondCode X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B; SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_avx512_knot_w: { SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1); SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); return DAG.getBitcast(MVT::i16, Res); } case Intrinsic::x86_avx512_kandn_w: { SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); // Invert LHS for the not. LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, DAG.getConstant(1, dl, MVT::v16i1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS); return DAG.getBitcast(MVT::i16, Res); } case Intrinsic::x86_avx512_kxnor_w: { SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); // Invert result for the not. Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res, DAG.getConstant(1, dl, MVT::v16i1)); return DAG.getBitcast(MVT::i16, Res); } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: case Intrinsic::x86_sse42_pcmpestric128: case Intrinsic::x86_sse42_pcmpistrio128: case Intrinsic::x86_sse42_pcmpestrio128: case Intrinsic::x86_sse42_pcmpistris128: case Intrinsic::x86_sse42_pcmpestris128: case Intrinsic::x86_sse42_pcmpistriz128: case Intrinsic::x86_sse42_pcmpestriz128: { unsigned Opcode; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse42_pcmpistria128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_E; break; } SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistri128: case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) Opcode = X86ISD::PCMPISTRI; else Opcode = X86ISD::PCMPESTRI; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); auto &Context = MF.getMMI().getContext(); MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + Twine(MF.getFunctionNumber())); return DAG.getNode(getGlobalWrapperKind(), dl, VT, DAG.getMCSymbol(S, PtrVT)); } case Intrinsic::x86_seh_lsda: { // Compute the symbol for the LSDA. We know it'll get emitted later. MachineFunction &MF = DAG.getMachineFunction(); SDValue Op1 = Op.getOperand(1); auto *Fn = cast(cast(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. SDValue Result = DAG.getMCSymbol(LSDASym, VT); return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); } case Intrinsic::x86_seh_recoverfp: { SDValue FnOp = Op.getOperand(1); SDValue IncomingFPOp = Op.getOperand(2); GlobalAddressSDNode *GSD = dyn_cast(FnOp); auto *Fn = dyn_cast_or_null(GSD ? GSD->getGlobal() : nullptr); if (!Fn) report_fatal_error( "llvm.x86.seh.recoverfp must take a function as the first argument"); return recoverFramePointer(DAG, Fn, IncomingFPOp); } case Intrinsic::localaddress: { // Returns one of the stack, base, or frame pointer registers, depending on // which is used to reference local variables. MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); else // This function handles the SP or FP case. Reg = RegInfo->getPtrSizedFrameRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } } } static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let ExecutionDepsFix deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); } static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let ExecutionDepsFix deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } /// Handles the lowering of builtin intrinsic that return the value /// of the extended control register. static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; // The ECX register is used to select the index of the XCR register to // return. SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain); Chain = SDValue(N1, 0); // Reads the content of XCR and returns it in registers EDX:EAX. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); if (Subtarget.is64Bit()) { // Merge the two 32-bit values into a 64-bit one.. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } /// Handles the lowering of builtin intrinsics that read performance monitor /// counters (x86_rdpmc). static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; // The ECX register is used to select the index of the performance counter // to read. SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); // Reads the content of a 64-bit performance counter and returns it in the // registers EDX:EAX. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); if (Subtarget.is64Bit()) { // The EAX register is loaded with the low-order 32 bits. The EDX register // is loaded with the supported high-order bits of the counter. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } /// Handles the lowering of builtin intrinsics that read the time stamp counter /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower /// READCYCLECOUNTER nodes. static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); SDValue LO, HI; // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } SDValue Chain = HI.getValue(1); if (Opcode == X86ISD::RDTSCP_DAG) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into // the ECX register. Add 'ecx' explicitly to the chain. SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, HI.getValue(2)); // Explicitly store the content of ECX at the location passed in input // to the 'rdtscp' intrinsic. Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), MachinePointerInfo()); } if (Subtarget.is64Bit()) { // The EDX register is loaded with the high-order 32 bits of the MSR, and // the EAX register is loaded with the low-order 32 bits. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector Results; SDLoc DL(Op); getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); return DAG.getMergeValues(Results, DL); } static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue RegNode = Op.getOperand(2); WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); if (!EHInfo) report_fatal_error("EH registrations only live in functions using WinEH"); // Cast the operand to an alloca, and remember the frame index. auto *FINode = dyn_cast(RegNode); if (!FINode) report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); // Return the chain operand without making any DAG nodes. return Chain; } static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue EHGuard = Op.getOperand(2); WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); if (!EHInfo) report_fatal_error("EHGuard only live in functions using WinEH"); // Cast the operand to an alloca, and remember the frame index. auto *FINode = dyn_cast(EHGuard); if (!FINode) report_fatal_error("llvm.x86.seh.ehguard expects a static alloca"); EHInfo->EHGuardFrameIndex = FINode->getIndex(); // Return the chain operand without making any DAG nodes. return Chain; } /// Emit Truncating Store with signed or unsigned saturation. static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; return SignedSat ? DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO) : DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO); } /// Emit Masked Truncating Store with signed or unsigned saturation. static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Ptr, Mask, Val }; return SignedSat ? DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO) : DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { switch (IntNo) { case llvm::Intrinsic::x86_seh_ehregnode: return MarkEHRegistrationNode(Op, DAG); case llvm::Intrinsic::x86_seh_ehguard: return MarkEHGuard(Op, DAG); case llvm::Intrinsic::x86_flags_read_u32: case llvm::Intrinsic::x86_flags_read_u64: case llvm::Intrinsic::x86_flags_write_u32: case llvm::Intrinsic::x86_flags_write_u64: { // We need a frame pointer because this will get lowered to a PUSH/POP // sequence. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later // during ExpandISelPseudos in EmitInstrWithCustomInserter. return SDValue(); } case Intrinsic::x86_lwpins32: case Intrinsic::x86_lwpins64: { SDLoc dl(Op); SDValue Chain = Op->getOperand(0); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); SDValue LwpIns = DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2), Op->getOperand(3), Op->getOperand(4)); SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG); SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, LwpIns.getValue(1)); } } return SDValue(); } SDLoc dl(Op); switch(IntrData->Type) { default: llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other); SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), DAG.getConstant(1, dl, Op->getValueType(1)), DAG.getConstant(X86::COND_B, dl, MVT::i8), SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } case GATHER_AVX2: { SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case GATHER: { //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { //scatter(base, mask, index, v1, scale); SDValue Chain = Op.getOperand(0); SDValue Base = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal = cast(Hint)->getZExtValue(); assert((HintVal == 2 || HintVal == 3) && "Wrong prefetch hint in intrinsic: should be 2 or 3"); unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, Subtarget); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector Results; getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. case RDPMC: { SmallVector Results; getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Get Extended Control Register. case XGETBV: { SmallVector Results; getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG); SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); } // ADC/ADCX/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32); SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), DAG.getConstant(-1, dl, MVT::i8)); SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), Op.getOperand(4), GenCF.getValue(1)); SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), Op.getOperand(5), MachinePointerInfo()); SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); SDValue Results[] = { SetCC, Store }; return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = DataToCompress.getSimpleValueType(); MemIntrinsicSDNode *MemIntr = dyn_cast(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MemIntr->getMemOperand()); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT, MemIntr->getMemOperand(), false /* truncating */, true /* compressing */); } case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { SDValue Mask = Op.getOperand(4); SDValue DataToTruncate = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MemIntrinsicSDNode *MemIntr = dyn_cast(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); EVT MemVT = MemIntr->getMemoryVT(); uint16_t TruncationOp = IntrData->Opc0; switch (TruncationOp) { case X86ISD::VTRUNC: { if (isAllOnesConstant(Mask)) // return just a truncate store return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT, MemIntr->getMemOperand()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT, MemIntr->getMemOperand(), true /* truncating */); } case X86ISD::VTRUNCUS: case X86ISD::VTRUNCS: { bool IsSigned = (TruncationOp == X86ISD::VTRUNCS); if (isAllOnesConstant(Mask)) return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT, MemIntr->getMemOperand(), DAG); MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, VMask, MemVT, MemIntr->getMemOperand(), DAG); } default: llvm_unreachable("Unsupported truncstore intrinsic"); } } case EXPAND_FROM_MEM: { SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MVT VT = Op.getSimpleValueType(); MemIntrinsicSDNode *MemIntr = dyn_cast(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load. return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); if (X86::isZeroNode(Mask)) return DAG.getUNDEF(VT); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, MemIntr->getMemOperand(), ISD::NON_EXTLOAD, true /* expanding */); } } } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo()); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo()); } SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const { DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true); return getReturnAddressFrameIndex(DAG); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); EVT VT = Op.getValueType(); MFI.setFrameAddressIsTaken(true); if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { // Depth > 0 makes no sense on targets which use Windows unwind codes. It // is not possible to crawl up the stack without looking at the unwind codes // simultaneously. int FrameAddrIndex = FuncInfo->getFAIndex(); if (!FrameAddrIndex) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( SlotSize, /*Offset=*/0, /*IsImmutable=*/false); FuncInfo->setFAIndex(FrameAddrIndex); } return DAG.getFrameIndex(FrameAddrIndex, VT); } unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const MachineFunction &MF = DAG.getMachineFunction(); unsigned Reg = StringSwitch(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) .Case("ebp", X86::EBP) .Case("rbp", X86::RBP) .Default(0); if (Reg == X86::EBP || Reg == X86::RBP) { if (!TFI.hasFP(MF)) report_fatal_error("register " + StringRef(RegName) + " is allocatable: function has no frame pointer"); #ifndef NDEBUG else { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"); } #endif } if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } unsigned X86TargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX; } unsigned X86TargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Funclet personalities don't use selectors (the runtime does the selection). assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; } bool X86TargetLowering::needsFixedCatchObjects() const { return Subtarget.isTargetWin64(); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc dl (Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl)); StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo()); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, DAG.getRegister(StoreAddrReg, PtrVT)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); // If the subtarget is not 64bit, we may need the global base reg // after isel expand pseudo, i.e., after CGBR pass ran. // Therefore, ask for the GlobalBaseReg now, so that the pass // inserts the code for us in case we need it. // Otherwise, we will end up in a situation where we will // reference a virtual register that is not defined! if (!Subtarget.is64Bit()) { const X86InstrInfo *TII = Subtarget.getInstrInfo(); (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); } return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, Op.getOperand(0)); } static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl (Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (Subtarget.is64Bit()) { SDValue OutChains[6]; // Large code-model. const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix // Load the pointer to the nested function into R11. unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, dl, MVT::i64)); OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), /* Alignment = */ 2); // Load the 'nest' parameter value into R10. // R10 is specified in X86CallingConv.td OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, dl, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 10)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, dl, MVT::i64)); OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), /* Alignment = */ 2); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, dl, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 20)); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, dl, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 22)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); CallingConv::ID CC = Func->getCallingConv(); unsigned NestReg; switch (CC) { default: llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::X86_StdCall: { // Pass 'nest' parameter in ECX. // Must be kept in sync with X86CallingConv.td NestReg = X86::ECX; // Check that ECX wasn't needed by an 'inreg' parameter. FunctionType *FTy = Func->getFunctionType(); const AttributeList &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; unsigned Idx = 1; for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) if (Attrs.hasAttribute(Idx, Attribute::InReg)) { auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" " parameters!"); } } break; } case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; break; } SDValue OutChains[4]; SDValue Addr, Disp; Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(10, dl, MVT::i32)); Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, dl, MVT::i32)); OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), /* Alignment = */ 1); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, dl, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 5), /* Alignment = */ 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, dl, MVT::i32)); OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), /* Alignment = */ 1); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { /* The rounding mode is in bits 11:10 of FPSR, and has the following settings: 00 Round to nearest 01 Round to -inf 10 Round to +inf 11 Round to 0 FLT_ROUNDS, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we do: (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) */ MachineFunction &MF = DAG.getMachineFunction(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo()); // Transform as necessary SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x800, DL, MVT::i16)), DAG.getConstant(11, DL, MVT::i8)); SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x400, DL, MVT::i16)), DAG.getConstant(9, DL, MVT::i8)); SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, DAG.getNode(ISD::ADD, DL, MVT::i16, DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), DAG.getConstant(1, DL, MVT::i16)), DAG.getConstant(3, DL, MVT::i16)); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } // Split an unary integer op into 2 half sized ops. static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumElems = VT.getVectorNumElements(); unsigned SizeInBits = VT.getSizeInBits(); // Extract the Lo/Hi vectors SDLoc dl(Op); SDValue Src = Op.getOperand(0); SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2); SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, Lo), DAG.getNode(Op.getOpcode(), dl, NewVT, Hi)); } // Decompose 256-bit ops into smaller 128-bit ops. static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return LowerVectorIntUnary(Op, DAG); } // Decompose 512-bit ops into smaller 256-bit ops. static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 512-bit vector integer operation"); return LowerVectorIntUnary(Op, DAG); } /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. // // i8/i16 vector implemented using dword LZCNT vector instruction // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, // split the vector, perform operation on it's Lo a Hi part and // concatenate the results. static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::CTLZ); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"); // Split vector, it's Lo and Hi parts will be handled in next iteration. if (16 < NumElems) return LowerVectorIntUnary(Op, DAG); MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && "Unsupported value type for operation"); // Use native supported vector instruction vplzcntd. Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); } // Lower CTLZ using a PSHUFB lookup table implementation. static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); int NumElts = VT.getVectorNumElements(); int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8); MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes); // Per-nibble leading zero PSHUFB lookup table. const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2, /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1, /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0, /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0}; SmallVector LUTVec; for (int i = 0; i < NumBytes; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec); // Begin by bitcasting the input to byte vector, then split those bytes // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them. // If the hi input nibble is zero then we add both results together, otherwise // we just take the hi result (by masking the lo result to zero before the // add). SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL); SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT); SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask); SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); SDValue HiZ; if (CurrVT.is512BitVector()) { MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ); HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); } else { HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); } Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ); SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi); // Merge result back from vXi8 back to VT, working on the lo/hi halves // of the current vector width in the same way we did for the nibbles. // If the upper half of the input element is zero then add the halves' // leading zero counts together, otherwise just use the upper half's. // Double the width of the result until we are at target width. while (CurrVT != VT) { int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits(); int CurrNumElts = CurrVT.getVectorNumElements(); MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2); MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2); SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT); // Check if the upper half of the input element is zero. if (CurrVT.is512BitVector()) { MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0), DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); } else { HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); } HiZ = DAG.getBitcast(NextVT, HiZ); // Move the upper/lower halves to the lower bits as we'll be extending to // NextVT. Mask the lower result to zero if HiZ is true and add the results // together. SDValue ResNext = Res = DAG.getBitcast(NextVT, Res); SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift); SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift); R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1); Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1); CurrVT = NextVT; } return Res; } static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (Subtarget.hasCDI()) return LowerVectorCTLZ_AVX512CDI(Op, DAG); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) return Lower512IntUnary(Op, DAG); assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); } static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); if (VT.isVector()) return LowerVectorCTLZ(Op, dl, Subtarget, DAG); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); if (Opc == ISD::CTLZ) { // If src is zero (i.e. bsr sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); } // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits - 1, dl, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getScalarSizeInBits(); SDLoc dl(Op); if (VT.isVector()) { SDValue N0 = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, dl, VT); // lsb(x) = (x & -x) SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); // cttz_undef(x) = (width - 1) - ctlz(lsb) if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, DAG.getNode(ISD::CTLZ, dl, VT, LSB)); } // cttz(x) = ctpop(lsb - 1) SDValue One = DAG.getConstant(1, dl, VT); return DAG.getNode(ISD::CTPOP, dl, VT, DAG.getNode(ISD::SUB, dl, VT, LSB, One)); } assert(Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits, dl, VT), DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } /// Break a 256-bit integer operation into two new 128-bit ones and then /// concatenate the result back. static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } /// Break a 512-bit integer operation into two new 256-bit ones and then /// concatenate the result back. static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is512BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::XOR, SDLoc(Op), VT, Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) { // Since X86 does not have CMOV for 8-bit integer, we don't convert // 8-bit integer abs to NEG and CMOV. SDLoc DL(Op); SDValue N0 = Op.getOperand(0); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, DL, VT), N0); SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8), SDValue(Neg.getNode(), 1)}; return DAG.getNode(X86ISD::CMOV, DL, VT, Ops); } assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntUnary(Op, DAG); } static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntArith(Op, DAG); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16 // vector pairs, multiply and truncate. if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { if (Subtarget.hasInt256()) { // For 512-bit vectors, split into 256-bit vectors to allow the // sign-extension to occur. if (VT == MVT::v64i8) return Lower512IntArith(Op, DAG); // For 256-bit vectors, split into 128-bit vectors to allow the // sign-extension to occur. We don't need this on AVX512BW as we can // safely sign-extend to v32i16. if (VT == MVT::v32i8 && !Subtarget.hasBWI()) return Lower256IntArith(Op, DAG); MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); return DAG.getNode( ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::MUL, dl, ExVT, DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); } assert(VT == MVT::v16i8 && "Pre-AVX2 support only supports v16i8 multiplication"); MVT ExVT = MVT::v8i16; // Extract the lo parts and sign extend to i16 SDValue ALo, BLo; if (Subtarget.hasSSE41()) { ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT); BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); ALo = DAG.getBitcast(ExVT, ALo); BLo = DAG.getBitcast(ExVT, BLo); ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); } // Extract the hi parts and sign extend to i16 SDValue AHi, BHi; if (Subtarget.hasSSE41()) { const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT); BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); AHi = DAG.getBitcast(ExVT, AHi); BHi = DAG.getBitcast(ExVT, BHi); AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); } // Multiply, mask the lower 8bits of the lo/hi results and pack SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. if (VT == MVT::v4i32) { assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() && "Should not custom lower when pmuldq is available!"); // Extract the odd parts. static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); // Multiply the even parts. SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); // Now multiply odd parts. SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); Evens = DAG.getBitcast(VT, Evens); Odds = DAG.getBitcast(VT, Odds); // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); // 32-bit vector types used for MULDQ/MULUDQ. MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); // MULDQ returns the 64-bit result of the signed multiplication of the lower // 32-bits. We can lower with this if the sign bits stretch that far. if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 && DAG.ComputeNumSignBits(B) > 32) { return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A), DAG.getBitcast(MulVT, B)); } // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // // AloBlo = pmuludq(a, b); // AloBhi = pmuludq(a, Bhi); // AhiBlo = pmuludq(Ahi, b); // // Hi = psllqi(AloBhi + AhiBlo, 32); // return AloBlo + Hi; APInt LowerBitsMask = APInt::getLowBitsSet(64, 32); bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask); bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask); APInt UpperBitsMask = APInt::getHighBitsSet(64, 32); bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask); bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask); // Bit cast to 32-bit vectors for MULUDQ. SDValue Alo = DAG.getBitcast(MulVT, A); SDValue Blo = DAG.getBitcast(MulVT, B); SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); // Only multiply lo/hi halves that aren't known to be zero. SDValue AloBlo = Zero; if (!ALoIsZero && !BLoIsZero) AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo); SDValue AloBhi = Zero; if (!ALoIsZero && !BHiIsZero) { SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); Bhi = DAG.getBitcast(MulVT, Bhi); AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi); } SDValue AhiBlo = Zero; if (!AHiIsZero && !BLoIsZero) { SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); Ahi = DAG.getBitcast(MulVT, Ahi); AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo); } SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi); } static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntArith(Op, DAG); // Only i8 vectors should need custom lowering after this. assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && "Unsupported vector type"); // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, // logical shift down the upper half and pack back to i8. SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack // and then ashr/lshr the upper bits down to the lower bits before multiply. unsigned Opcode = Op.getOpcode(); unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA); unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND); // For 512-bit vectors, split into 256-bit vectors to allow the // sign-extension to occur. if (VT == MVT::v64i8) return Lower512IntArith(Op, DAG); // AVX2 implementations - extend xmm subvectors to ymm. if (Subtarget.hasInt256()) { unsigned NumElems = VT.getVectorNumElements(); SDValue Lo = DAG.getIntPtrConstant(0, dl); SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl); if (VT == MVT::v32i8) { if (Subtarget.hasBWI()) { SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A); SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB); Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul, DAG.getConstant(8, dl, MVT::v32i16)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); } SDValue ALo = extract128BitVector(A, 0, DAG, dl); SDValue BLo = extract128BitVector(B, 0, DAG, dl); SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl); SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl); ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo); BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo); AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi); BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi); Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16, DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo), DAG.getConstant(8, dl, MVT::v16i16)); Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16, DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi), DAG.getConstant(8, dl, MVT::v16i16)); // The ymm variant of PACKUS treats the 128-bit lanes separately, so before // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane. const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; return DAG.getNode(X86ISD::PACKUS, dl, VT, DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask), DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); } assert(VT == MVT::v16i8 && "Unexpected VT"); SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A); SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, DAG.getConstant(8, dl, MVT::v16i16)); // If we have BWI we can use truncate instruction. if (Subtarget.hasBWI()) return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi); return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); } assert(VT == MVT::v16i8 && "Pre-AVX2 support only supports v16i8 multiplication"); MVT ExVT = MVT::v8i16; unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT); // Extract the lo parts and zero/sign extend to i16. SDValue ALo, BLo; if (Subtarget.hasSSE41()) { ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG); BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); ALo = DAG.getBitcast(ExVT, ALo); BLo = DAG.getBitcast(ExVT, BLo); ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); } // Extract the hi parts and zero/sign extend to i16. SDValue AHi, BHi; if (Subtarget.hasSSE41()) { const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG); BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); AHi = DAG.getBitcast(ExVT, AHi); BHi = DAG.getBitcast(ExVT, BHi); AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); } // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and // pack back to v16i8. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT)); RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); RTLIB::Libcall LC; bool isSigned; switch (Op->getOpcode()) { default: llvm_unreachable("Unexpected request for libcall!"); case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; } SDLoc dl(Op); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(InChain) .setLibCallee( getLibcallCallingConv(LC), static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, std::move(Args)) .setInRegister() .setSExtResult(isSigned) .setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); } static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); MVT VT = Op0.getSimpleValueType(); SDLoc dl(Op); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) { unsigned Opcode = Op.getOpcode(); unsigned NumElems = VT.getVectorNumElements(); MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2); SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl); SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl); SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl); SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl); SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1); SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1); SDValue Ops[] = { DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)), DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1)) }; return DAG.getMergeValues(Ops, dl); } assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::v16i32 && Subtarget.hasAVX512())); int NumElts = VT.getVectorNumElements(); // PMULxD operations multiply each even value (starting at 0) of LHS with // the related value of RHS and produce a widen result. // E.g., PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> // // In other word, to have all the results, we need to perform two PMULxD: // 1. one with the even values. // 2. one with the odd values. // To achieve #2, with need to place the odd values at an even position. // // Place the odd value at an even position (basically, shift all values 1 // step to the left): const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}; // => SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, makeArrayRef(&Mask[0], NumElts)); // => SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, makeArrayRef(&Mask[0], NumElts)); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2); bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; unsigned Opcode = (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. SmallVector HighMask(NumElts); SmallVector LowMask(NumElts); for (int i = 0; i != NumElts; ++i) { HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1; LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts); } SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); // If we have a signed multiply but no PMULDQ fix up the high parts of a // unsigned multiply. if (IsSigned && !Subtarget.hasSSE41()) { SDValue ShAmt = DAG.getConstant( 31, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout())); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } // The first result of MUL_LOHI is actually the low value, followed by the // high value. SDValue Ops[] = {Lows, Highs}; return DAG.getMergeValues(Ops, dl); } // Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { if (VT.getScalarSizeInBits() < 16) return false; if (VT.is512BitVector() && Subtarget.hasAVX512() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || (VT.is256BitVector() && Subtarget.hasInt256()); bool AShift = LShift && (Subtarget.hasAVX512() || (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? AShift : LShift; } // The shift amount is a variable, but it is the same for all vector lanes. // These instructions are defined together with shift-immediate. static bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } // Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) return false; // vXi16 supported only on AVX-512, BWI if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; if (Subtarget.hasAVX512()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; return (Opcode == ISD::SRA) ? AShift : LShift; } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); SDValue Ex = DAG.getBitcast(ExVT, R); // ashr(R, 63) === cmp_slt(R, 0) if (ShiftAmt == 63 && Subtarget.hasSSE42()) { assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && "Unsupported PCMPGT op"); return DAG.getNode(X86ISD::PCMPGT, dl, VT, getZeroVector(VT, Subtarget, DAG, dl), R); } if (ShiftAmt >= 32) { // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt - 32, DAG); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {9, 1, 11, 3, 13, 5, 15, 7}); } else { // SRA upper i32, SHL whole i64 and select lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); Lower = DAG.getBitcast(ExVT, Lower); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {8, 1, 10, 3, 12, 5, 14, 7}); } return DAG.getBitcast(VT, Ex); }; // Optimize shl/srl/sra with constant shift amount. if (auto *BVAmt = dyn_cast(Amt)) { if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { uint64_t ShiftAmt = ShiftConst->getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || (Subtarget.hasInt256() && VT == MVT::v4i64)) && Op.getOpcode() == ISD::SRA) return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Simple i8 add case if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) return DAG.getNode(ISD::ADD, dl, VT, R, R); // ashr(R, 7) === cmp_slt(R, 0) if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); if (VT.is512BitVector()) { assert(VT == MVT::v64i8 && "Unexpected element type!"); SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R); return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); } return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // XOP can shift v16i8 directly instead of as shift v8i16 + mask. if (VT == MVT::v16i8 && Subtarget.hasXOP()) return SDValue(); if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } llvm_unreachable("Unknown shift opcode."); } } } // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. // TODO: Replace constant extraction with getTargetConstantBitsFromNode. if (!Subtarget.hasXOP() && (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) || (Subtarget.hasAVX512() && VT == MVT::v8i64))) { // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant. unsigned SubVectorScale = 1; if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) { SubVectorScale = Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits(); Amt = Amt.getOperand(0); } // Peek through any splat that was introduced for i64 shift vectorization. int SplatIndex = -1; if (ShuffleVectorSDNode *SVN = dyn_cast(Amt.getNode())) if (SVN->isSplat()) { SplatIndex = SVN->getSplatIndex(); Amt = Amt.getOperand(0); assert(SplatIndex < (int)VT.getVectorNumElements() && "Splat shuffle referencing second operand"); } if (Amt.getOpcode() != ISD::BITCAST || Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) return SDValue(); Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / (SubVectorScale * VT.getVectorNumElements()); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); for (unsigned i = 0; i != Ratio; ++i) { ConstantSDNode *C = dyn_cast(Amt.getOperand(i + BaseOp)); if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); } // Check remaining shift amounts (if not a splat). if (SplatIndex < 0) { for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { uint64_t ShAmt = 0; for (unsigned j = 0; j != Ratio; ++j) { ConstantSDNode *C = dyn_cast(Amt.getOperand(i + j)); if (!C) return SDValue(); // 6 == Log2(64) ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); } if (ShAmt != ShiftAmt) return SDValue(); } } if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); if (Op.getOpcode() == ISD::SRA) return ArithmeticShiftRight64(ShiftAmt); } return SDValue(); } static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA; if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { SDValue BaseShAmt; MVT EltVT = VT.getVectorElementType(); if (BuildVectorSDNode *BV = dyn_cast(Amt)) { // Check if this build_vector node is doing a splat. // If so, then set BaseShAmt equal to the splat value. BaseShAmt = BV->getSplatValue(); if (BaseShAmt && BaseShAmt.isUndef()) BaseShAmt = SDValue(); } else { if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) Amt = Amt.getOperand(0); ShuffleVectorSDNode *SVN = dyn_cast(Amt); if (SVN && SVN->isSplat()) { unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && "Unexpected shuffle index found!"); BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { if (ConstantSDNode *C = dyn_cast(InVec.getOperand(2))) { if (C->getZExtValue() == SplatIdx) BaseShAmt = InVec.getOperand(1); } } if (!BaseShAmt) // Avoid introducing an extract element from a shuffle. BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, DAG.getIntPtrConstant(SplatIdx, dl)); } } if (BaseShAmt.getNode()) { assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); } } // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); std::vector Vals(Ratio); for (unsigned i = 0; i != Ratio; ++i) Vals[i] = Amt.getOperand(i); for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { for (unsigned j = 0; j != Ratio; ++j) if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); } return SDValue(); } static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"); if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) return V; if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) return V; if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) return Op; // XOP has 128-bit variable logical/arithmetic shifts. // +ve/-ve Amt = shift left/right. if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) { if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); } if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); if (Op.getOpcode() == ISD::SRA) return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); } // 2i64 vector logical shifts can efficiently avoid scalarization - do the // shifts per-lane and then shuffle the partial results back together. if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { // Splat the shift amounts so the scalar shifts above will catch it. SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } // i64 vector arithmetic shift can be emulated with the transform: // M = lshr(SIGN_MASK, Amt) // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) && Op.getOpcode() == ISD::SRA) { SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT); SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); R = DAG.getNode(ISD::XOR, dl, VT, R, M); R = DAG.getNode(ISD::SUB, dl, VT, R, M); return R; } // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. if (ConstantAmt && Op.getOpcode() == ISD::SHL && (VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget.hasInt256() && VT == MVT::v16i16))) { SmallVector Elts; MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); APInt One(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i=0; i !=NumElems; ++i) { SDValue Op = Amt->getOperand(i); if (Op->isUndef()) { Elts.push_back(Op); continue; } ConstantSDNode *ND = cast(Op); APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); continue; } Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); } SDValue BV = DAG.getBuildVector(VT, dl, Elts); return DAG.getNode(ISD::MUL, dl, VT, R, BV); } // Lower SHL with variable shift amount. if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, dl, VT)); Op = DAG.getBitcast(MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } // If possible, lower this shift as a sequence of two shifts by // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it. // Example: // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) // // Could be rewritten as: // (v4i32 (MOVSS (srl A, ), (srl A, ))) // // The advantage is that the two shifts from the example would be // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing // the vector shift into four scalar shifts plus four pairs of vector // insert/extract. if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) { bool UseMOVSD = false; bool CanBeSimplified; // The splat value for the first packed shift (the 'X' from the example). SDValue Amt1 = Amt->getOperand(0); // The splat value for the second packed shift (the 'Y' from the example). SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2); // See if it is possible to replace this node with a sequence of // two shifts followed by a MOVSS/MOVSD/PBLEND. if (VT == MVT::v4i32) { // Check if it is legal to use a MOVSS. CanBeSimplified = Amt2 == Amt->getOperand(2) && Amt2 == Amt->getOperand(3); if (!CanBeSimplified) { // Otherwise, check if we can still simplify this node using a MOVSD. CanBeSimplified = Amt1 == Amt->getOperand(1) && Amt->getOperand(2) == Amt->getOperand(3); UseMOVSD = true; Amt2 = Amt->getOperand(2); } } else { // Do similar checks for the case where the machine value type // is MVT::v8i16. CanBeSimplified = Amt1 == Amt->getOperand(1); for (unsigned i=3; i != 8 && CanBeSimplified; ++i) CanBeSimplified = Amt2 == Amt->getOperand(i); if (!CanBeSimplified) { UseMOVSD = true; CanBeSimplified = true; Amt2 = Amt->getOperand(4); for (unsigned i=0; i != 4 && CanBeSimplified; ++i) CanBeSimplified = Amt1 == Amt->getOperand(i); for (unsigned j=4; j != 8 && CanBeSimplified; ++j) CanBeSimplified = Amt2 == Amt->getOperand(j); } } if (CanBeSimplified && isa(Amt1) && isa(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND. SDValue Splat1 = DAG.getConstant(cast(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); SDValue Splat2 = DAG.getConstant(cast(Amt2)->getAPIntValue(), dl, VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1); SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2); if (UseMOVSD) return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1, BitCast2, {0, 1, 6, 7})); return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1, BitCast2, {0, 5, 6, 7})); } } // v4i32 Non Uniform Shifts. // If the shift amount is constant we can shift each lane using the SSE2 // immediate shifts, else we need to zero-extend each lane to the lower i64 // and shift using the SSE2 variable shifts. // The separate results can then be blended together. if (VT == MVT::v4i32) { unsigned Opc = Op.getOpcode(); SDValue Amt0, Amt1, Amt2, Amt3; if (ConstantAmt) { Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); } else { // ISD::SHL is handled above but we include it here for completeness. switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case ISD::SHL: Opc = X86ISD::VSHL; break; case ISD::SRL: Opc = X86ISD::VSRL; break; case ISD::SRA: Opc = X86ISD::VSRA; break; } // The SSE2 shifts use the lower i64 as the same shift amount for // all lanes and the upper i64 is ignored. These shuffle masks // optimally zero-extend each lanes on SSE2/SSE41/AVX targets. SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); } SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } // It's worth extending once and using the vXi16/vXi32 shifts for smaller // types, but without AVX512 the extra overheads to get from vXi8 to vXi32 // make the existing SSE solution better. if ((Subtarget.hasInt256() && VT == MVT::v8i16) || (Subtarget.hasAVX512() && VT == MVT::v16i16) || (Subtarget.hasAVX512() && VT == MVT::v16i8) || (Subtarget.hasBWI() && VT == MVT::v32i8)) { assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && "Unexpected vector type"); MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32; MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); unsigned ExtOpc = Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; R = DAG.getNode(ExtOpc, dl, ExtVT, R); Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (VT.is512BitVector()) { // On AVX512BW targets we make use of the fact that VSELECT lowers // to a masked blend which selects bytes based just on the sign bit // extracted to a mask. MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); return DAG.getSelect(dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. Amt = DAG.getBitcast(ExtVT, Amt); Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT)); Amt = DAG.getBitcast(VT, Amt); if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) { // r = VSELECT(r, shift(r, 4), a); SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 2), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // return VSELECT(r, shift(r, 1), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); R = SignBitSelect(VT, Amt, M, R); return R; } if (Op->getOpcode() == ISD::SRA) { // For SRA we need to unpack each byte to the higher byte of a i16 vector // so we can correctly sign extend. We don't care what happens to the // lower byte. SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); RHi = DAG.getBitcast(ExtVT, RHi); // r = VSELECT(r, shift(r, 4), a); SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, DAG.getConstant(4, dl, ExtVT)); SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, DAG.getConstant(4, dl, ExtVT)); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // a += a ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); // r = VSELECT(r, shift(r, 2), a); MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, DAG.getConstant(2, dl, ExtVT)); MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, DAG.getConstant(2, dl, ExtVT)); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // a += a ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); // r = VSELECT(r, shift(r, 1), a); MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, DAG.getConstant(1, dl, ExtVT)); MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, DAG.getConstant(1, dl, ExtVT)); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // Logical shift the result back to the lower byte, leaving a zero upper // byte // meaning that we can safely pack with PACKUSWB. RLo = DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT)); RHi = DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } } if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R); SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); RHi = DAG.getBitcast(ExtVT, RHi); SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); } if (VT == MVT::v8i16) { unsigned ShiftOpcode = Op->getOpcode(); // If we have a constant shift amount, the non-SSE41 path is best as // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW. bool UseSSE41 = Subtarget.hasSSE41() && !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. if (UseSSE41) { MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue C = DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); return DAG.getSelect(dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; if (UseSSE41) { // On SSE41 targets we need to replicate the shift mask in both // bytes for PBLENDVB. Amt = DAG.getNode( ISD::OR, dl, VT, DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); } else { Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); } // r = VSELECT(r, shift(r, 8), a); SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 4), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 2), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // return VSELECT(r, shift(r, 1), a); M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); R = SignBitSelect(Amt, M, R); return R; } // Decompose 256-bit shifts into smaller 128-bit shifts. if (VT.is256BitVector()) return Lower256IntArith(Op, DAG); return SDValue(); } static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); if (Subtarget.hasAVX512()) { // Attempt to rotate by immediate. APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) { if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) { return EltBits[0] == V; })) { unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits); return DAG.getNode(Op, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } } // Else, fall-back on VPROLV/VPRORV. return Op; } assert(VT.isVector() && "Custom lowering only for vector rotates!"); assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. // Split 256-bit integers. if (VT.is256BitVector()) return Lower256IntArith(Op, DAG); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. if (auto *BVAmt = dyn_cast(Amt)) { if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); assert(RotateAmt < EltSizeInBits && "Rotation out of range"); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } } // Use general rotate by variable (per-element). return Op; } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" // has only one use. SDNode *N = Op.getNode(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); unsigned BaseOp = 0; X86::CondCode Cond; SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. if (isOneConstant(RHS)) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; } BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; case ISD::UADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_B; break; case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. if (isOneConstant(RHS)) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; } BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; case ISD::USUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_B; break; case ISD::SMULO: BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs if (N->getValueType(0) == MVT::i8) { BaseOp = X86ISD::UMUL8; Cond = X86::COND_O; break; } SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG); if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } } // Also sets EFLAGS. SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG); if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } /// Returns true if the operand type is exactly twice the native width, and /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b else if (OpWidth == 128) return Subtarget.hasCmpxchg16b(); else return false; } bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { return needsCmpXchgNb(SI->getValueOperand()->getType()); } // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { auto PTy = cast(LI->getPointerOperandType()); return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { default: llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Xchg: case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. return !AI->use_empty() ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return AtomicExpansionKind::CmpXChg; } } LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. if (MemType->getPrimitiveSizeInBits() > NativeWidth) return nullptr; auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); auto Ptr = AI->getPointerOperand(); // Before the load we need a fence. Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence // is required: // Thread 0: // x.store(1, relaxed); // r1 = y.fetch_add(0, release); // Thread 1: // y.fetch_add(42, acquire); // r2 = x.load(relaxed); // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SSID == SyncScope::SingleThread) // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; if (!Subtarget.hasMFence()) // FIXME: it might make sense to use a locked operation here but on a // different cache-line to prevent cache-line bouncing. In practice it // is probably a small win, and x86 processors without mfence are rare // enough that we do not bother. return nullptr; Function *MFence = llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AI->getType()->getPrimitiveSizeInBits()); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast( cast(Op.getOperand(1))->getZExtValue()); SyncScope::ID FenceSSID = static_cast( cast(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && FenceSSID == SyncScope::System) { if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { DAG.getRegister(X86::ESP, MVT::i32), // Base DAG.getTargetConstant(1, dl, MVT::i8), // Scale DAG.getRegister(0, MVT::i32), // Index DAG.getTargetConstant(0, dl, MVT::i32), // Disp DAG.getRegister(0, MVT::i32), // Segment. Zero, Chain }; SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); return SDValue(Res, 0); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); } static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT T = Op.getSimpleValueType(); SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; switch(T.SimpleTy) { default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; case MVT::i64: assert(Subtarget.is64Bit() && "Node not type legal!"); Reg = X86::RAX; size = 8; break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, MVT::i32, cpOut.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); return SDValue(); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || SrcVT == MVT::i64) { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); SDValue Op0 = Op->getOperand(0); SmallVector Elts; SDLoc dl(Op); unsigned NumElts; MVT SVT; if (SrcVT.isVector()) { NumElts = SrcVT.getVectorNumElements(); SVT = SrcVT.getVectorElementType(); // Widen the vector in input in the case of MVT::v2i32. // Example: from MVT::v2i32 to MVT::v4i32. for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, DAG.getIntPtrConstant(i, dl))); } else { assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && "Unexpected source type in LowerBITCAST"); Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, DAG.getIntPtrConstant(0, dl))); Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, DAG.getIntPtrConstant(1, dl))); NumElts = 2; SVT = MVT::i32; } // Explicitly mark the extra elements as Undef. Elts.append(NumElts, DAG.getUNDEF(SVT)); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue BV = DAG.getBuildVector(NewVT, dl, Elts); SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, DAG.getIntPtrConstant(0, dl)); } assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() && Subtarget.hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && "Unexpected custom BITCAST"); // i64 <=> MMX conversions are Legal. if (SrcVT==MVT::i64 && DstVT.isVector()) return Op; if (DstVT==MVT::i64 && SrcVT.isVector()) return Op; // MMX <=> MMX conversions are Legal. if (SrcVT.isVector() && DstVT.isVector()) return Op; // All other conversions need to be expanded. return SDValue(); } /// Compute the horizontal sum of bytes in V for the elements of VT. /// /// Requires V to be a byte vector and VT to be an integer vector type with /// wider elements than V's type. The width of the elements of VT determines /// how many bytes of V are summed horizontally to produce each element of the /// result. static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(V); MVT ByteVecVT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); assert(ByteVecVT.getVectorElementType() == MVT::i8 && "Expected value to have byte element type."); assert(EltVT != MVT::i8 && "Horizontal byte sum only makes sense for wider elements!"); unsigned VecSize = VT.getSizeInBits(); assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); // PSADBW instruction horizontally add all bytes and leave the result in i64 // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } if (EltVT == MVT::i32) { // We unpack the low half and high half into i32s interleaved with zeros so // that we can use PSADBW to horizontally sum them. The most useful part of // this is that it lines up the results of two PSADBW instructions to be // two v2i64 vectors which concatenated are the 4 population counts. We can // then use PACKUSWB to shrink and concatenate them into a v4i32 again. SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); SDValue V32 = DAG.getBitcast(VT, V); SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros); SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros); // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, DAG.getBitcast(ShortVecVT, Low), DAG.getBitcast(ShortVecVT, High)); return DAG.getBitcast(VT, V); } // The only element type left is i16. assert(EltVT == MVT::i16 && "Unknown how to handle type"); // To obtain pop count for each i16 element starting from the pop count for // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s // right by 8. It is important to shift as i16s as i8 vector shift isn't // directly supported. SDValue ShifterV = DAG.getConstant(8, DL, VT); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV); V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), DAG.getBitcast(ByteVecVT, V)); return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV); } static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned VecSize = VT.getSizeInBits(); // Implement a lookup table in register by using an algorithm based on: // http://wm.ite.pl/articles/sse-popcount.html // // The general idea is that every lower byte nibble in the input vector is an // index into a in-register pre-computed pop count table. We then split up the // input vector in two new ones: (1) a vector with only the shifted-right // higher nibbles for each byte and (2) a vector with the lower nibbles (and // masked out higher ones) for each byte. PSHUFB is used separately with both // to index the in-register table. Next, both are added and the result is a // i8 vector where each element contains the pop count for input byte. // // To obtain the pop count for elements != i8, we follow up with the same // approach and use additional tricks as described below. // const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; int NumByteElts = VecSize / 8; MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); SDValue In = DAG.getBitcast(ByteVecVT, Op); SmallVector LUTVec; for (int i = 0; i < NumByteElts; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec); SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT); // High nibbles SDValue FourV = DAG.getConstant(4, DL, ByteVecVT); SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); // Low nibbles SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); // The input vector is used as the shuffle mask that index elements into the // LUT. After counting low and high nibbles, add the vector to obtain the // final pop count per i8 element. SDValue HighPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); SDValue LowPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); if (EltVT == MVT::i8) return PopCnt; return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); } static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is128BitVector() && "Only 128-bit vector bitmath lowering supported."); int VecSize = VT.getSizeInBits(); MVT EltVT = VT.getVectorElementType(); int Len = EltVT.getSizeInBits(); // This is the vectorized version of the "best" algorithm from // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel // with a minor tweak to use a series of adds + shifts instead of vector // multiplications. Implemented for all integer vector types. We only use // this when we don't have SSSE3 which allows a LUT-based lowering that is // much faster, even faster than using native popcnt instructions. auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { MVT VT = V.getSimpleValueType(); SDValue ShifterV = DAG.getConstant(Shifter, DL, VT); return DAG.getNode(OpCode, DL, VT, V, ShifterV); }; auto GetMask = [&](SDValue V, APInt Mask) { MVT VT = V.getSimpleValueType(); SDValue MaskV = DAG.getConstant(Mask, DL, VT); return DAG.getNode(ISD::AND, DL, VT, V, MaskV); }; // We don't want to incur the implicit masks required to SRL vNi8 vectors on // x86, so set the SRL type to have elements at least i16 wide. This is // correct because all of our SRLs are followed immediately by a mask anyways // that handles any bits that sneak into the high bits of the byte elements. MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16); SDValue V = Op; // v = v - ((v >> 1) & 0x55555555...) SDValue Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55))); V = DAG.getNode(ISD::SUB, DL, VT, V, And); // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33))); Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33))); V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); // v = (v + (v >> 4)) & 0x0F0F0F0F... Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); // At this point, V contains the byte-wise population count, and we are // merely doing a horizontal sum if necessary to get the wider element // counts. if (EltVT == MVT::i8) return V; return LowerHorizontalByteSum( DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, DAG); } // Please ensure that any codegen change from LowerVectorCTPOP is reflected in // updated cost models in X86TTIImpl::getIntrinsicInstrCost. static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && "Unknown CTPOP type to handle"); SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. if (Subtarget.hasVPOPCNTDQ()) { unsigned NumElems = VT.getVectorNumElements(); assert((VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16) && "Unexpected type"); if (NumElems <= 16) { MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0); Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op); return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } } if (!Subtarget.hasSSSE3()) { // We can't use the fast LUT approach, so fall back on vectorized bitmath. assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); } // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) return Lower512IntUnary(Op, DAG); return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); SDLoc DL(Op); // For scalars, its still beneficial to transfer to/from the SIMD unit to // perform the BITREVERSE. if (!VT.isVector()) { MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits()); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } int NumElts = VT.getVectorNumElements(); int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector()) return Lower256IntUnary(Op, DAG); assert(VT.is128BitVector() && "Only 128-bit vector bitreverse lowering supported."); // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we // perform the BSWAP in the shuffle. // Its best to shuffle using the second operand as this will implicitly allow // memory folding for multiple vectors. SmallVector MaskElts; for (int i = 0; i != NumElts; ++i) { for (int j = ScalarSizeInBytes - 1; j >= 0; --j) { int SourceByte = 16 + (i * ScalarSizeInBytes) + j; int PermuteByte = SourceByte | (2 << 5); MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8)); } } SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts); SDValue Res = DAG.getBitcast(MVT::v16i8, In); Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8), Res, Mask); return DAG.getBitcast(VT, Res); } static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (Subtarget.hasXOP() && !VT.is512BitVector()) return LowerBITREVERSE_XOP(Op, DAG); assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); SDValue In = Op.getOperand(0); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && "Only byte vector BITREVERSE supported"); // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Perform BITREVERSE using PSHUFB lookups. Each byte is split into // two nibbles and a PSHUFB lookup to find the bitreverse of each // 0-15 value (moved to the other nibble). SDValue NibbleMask = DAG.getConstant(0xF, DL, VT); SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask); SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT)); const int LoLUT[16] = { /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0, /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0, /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0, /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0}; const int HiLUT[16] = { /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C, /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E, /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D, /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F}; SmallVector LoMaskElts, HiMaskElts; for (unsigned i = 0; i < NumElts; ++i) { LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8)); HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8)); } SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts); SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts); Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo); Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi); return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool AllowIncDec = true) { unsigned NewOpc = 0; switch (N->getOpcode()) { case ISD::ATOMIC_LOAD_ADD: NewOpc = X86ISD::LADD; break; case ISD::ATOMIC_LOAD_SUB: NewOpc = X86ISD::LSUB; break; case ISD::ATOMIC_LOAD_OR: NewOpc = X86ISD::LOR; break; case ISD::ATOMIC_LOAD_XOR: NewOpc = X86ISD::LXOR; break; case ISD::ATOMIC_LOAD_AND: NewOpc = X86ISD::LAND; break; default: llvm_unreachable("Unknown ATOMIC_LOAD_ opcode"); } MachineMemOperand *MMO = cast(N)->getMemOperand(); if (auto *C = dyn_cast(N->getOperand(2))) { // Convert to inc/dec if they aren't slow or we are optimizing for size. if (AllowIncDec && (!Subtarget.slowIncDec() || DAG.getMachineFunction().getFunction().optForSize())) { if ((NewOpc == X86ISD::LADD && C->isOne()) || (NewOpc == X86ISD::LSUB && C->isAllOnesValue())) return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), {N->getOperand(0), N->getOperand(1)}, /*MemVT=*/N->getSimpleValueType(0), MMO); if ((NewOpc == X86ISD::LSUB && C->isOne()) || (NewOpc == X86ISD::LADD && C->isAllOnesValue())) return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), {N->getOperand(0), N->getOperand(1)}, /*MemVT=*/N->getSimpleValueType(0), MMO); } } return DAG.getMemIntrinsicNode( NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), {N->getOperand(0), N->getOperand(1), N->getOperand(2)}, /*MemVT=*/N->getSimpleValueType(0), MMO); } /// Lower atomic_load_ops into LOCK-prefixed operations. static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Chain = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); unsigned Opc = N->getOpcode(); MVT VT = N->getSimpleValueType(0); SDLoc DL(N); // We can lower atomic_load_add into LXADD. However, any other atomicrmw op // can only be lowered when the result is unused. They should have already // been transformed into a cmpxchg loop in AtomicExpand. if (N->hasAnyUseOfValue(0)) { // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to // select LXADD if LOCK_SUB can't be selected. if (Opc == ISD::ATOMIC_LOAD_SUB) { AtomicSDNode *AN = cast(N.getNode()); RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS, AN->getMemOperand()); } assert(Opc == ISD::ATOMIC_LOAD_ADD && "Used AtomicRMW ops other than Add should have been expanded!"); return N; } SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); // RAUW the chain, but don't worry about the result, as it's unused. assert(!N->hasAnyUseOfValue(0)); DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); return SDValue(); } static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); EVT VT = cast(Node)->getMemoryVT(); // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) // FIXME: On 32-bit, store -> fist or movq would be more efficient // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. if (cast(Node)->getOrdering() == AtomicOrdering::SequentiallyConsistent || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), cast(Node)->getMemOperand()); return Swap.getValue(1); } // Other atomic stores have a simple pattern. return Op; } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); MVT VT = N->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); SDLoc DL(N); // Set the carry flag. SDValue Carry = Op.getOperand(2); EVT CarryVT = Carry.getValueType(); APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getConstant(NegOne, DL, CarryVT)); unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB; SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0), Op.getOperand(1), Carry.getValue(1)); SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG); if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) : (Type *)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); std::pair CallResult = TLI.LowerCallTo(CLI); if (isF64) // Returned in xmm0 and xmm1. return CallResult.first; // Returned in bits 0:31 and 32:64 xmm0. SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(0, dl)); SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(1, dl)); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes = false) { // Check if InOp already has the right width. MVT InVT = InOp.getSimpleValueType(); if (InVT == NVT) return InOp; if (InOp.isUndef()) return DAG.getUNDEF(NVT); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); unsigned InNumElts = InVT.getVectorNumElements(); unsigned WidenNumElts = NVT.getVectorNumElements(); assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { SDValue N1 = InOp.getOperand(1); if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || N1.isUndef()) { InOp = InOp.getOperand(0); InVT = InOp.getSimpleValueType(); InNumElts = InVT.getVectorNumElements(); } } if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { SmallVector Ops; for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); EVT EltVT = InOp.getOperand(0).getValueType(); SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) Ops.push_back(FillVal); return DAG.getBuildVector(NVT, dl, Ops); } SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedScatterSDNode *N = cast(Op.getNode()); SDValue Src = N->getValue(); MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); MVT MemVT = N->getMemoryVT().getSimpleVT(); MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { // The v2i32 value was promoted to v2i64. // Now we "redo" the type legalizer's work and widen the original // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 // with a shuffle. assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && "Unexpected memory type"); int ShuffleMask[] = {0, 2, -1, -1}; Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), DAG.getUNDEF(MVT::v4i32), ShuffleMask); // Now we have 4 elements instead of 2. // Expand the index. MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); Index = ExtendToType(Index, NewIndexVT, DAG); // Expand the mask with zeroes // Mask may be <2 x i64> or <2 x i1> at this moment assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && "Unexpected mask type"); MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); VT = MVT::v4i32; } unsigned NumElts = VT.getVectorNumElements(); if (!Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. Or data or index should // be 512 bit wide. If now the both index and data are 256-bit, but // the vector contains 8 elements, we just sign-extend the index if (IndexVT == MVT::v8i32) // Just extend index Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); else { // The minimal number of elts in scatter is 8 NumElts = 8; // Index MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); // Use original index here, do not modify the index twice Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); if (IndexVT.getScalarType() == MVT::i32) Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); // Mask // At this point we have promoted mask operand assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); // Use the original mask here, do not modify the mask twice Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); // The value that should be stored MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); Src = ExtendToType(Src, NewVT, DAG); } } // If the mask is "wide" at this point - truncate it to i1 vector MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); // The mask is killed by scatter, add it to the values SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedLoadSDNode *N = cast(Op.getNode()); MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); SDLoc dl(Op); assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) && "Expanding masked load is supported for 32 and 64-bit types only!"); // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of // VLX. These types for exp-loads are handled here. if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4) return Op; assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked load op."); assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked load op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); SDValue Src0 = N->getSrc0(); Src0 = ExtendToType(Src0, WideDataVT, DAG); // Mask element has to be i1. MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && "We handle 4x32, 4x64 and 2x64 vectors only in this case"); MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); if (MaskEltTy != MVT::i1) Mask = DAG.getNode(ISD::TRUNCATE, dl, MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask); SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), N->getBasePtr(), Mask, Src0, N->getMemoryVT(), N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), DAG.getIntPtrConstant(0, dl)); SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedStoreSDNode *N = cast(Op.getNode()); SDValue DataToStore = N->getValue(); MVT VT = DataToStore.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); SDLoc dl(Op); assert((!N->isCompressingStore() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) && "Expanding masked load is supported for 32 and 64-bit types only!"); // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX. if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4) return Op; assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked store op."); assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked store op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); // Mask element has to be i1. MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && "We handle 4x32, 4x64 and 2x64 vectors only in this case"); MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); if (MaskEltTy != MVT::i1) Mask = DAG.getNode(ISD::TRUNCATE, dl, MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), N->isTruncatingStore(), N->isCompressingStore()); } static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only"); MaskedGatherSDNode *N = cast(Op.getNode()); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue Src0 = N->getValue(); MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); // If the index is v2i32, we're being called by type legalization. if (IndexVT == MVT::v2i32) return SDValue(); if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. Or data or index should // be 512 bit wide. If now the both index and data are 256-bit, but // the vector contains 8 elements, we just sign-extend the index if (NumElts == 8) { Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; SDValue NewGather = DAG.getTargetMemSDNode( DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(), N->getMemOperand()); return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl); } // Minimal number of elements in Gather NumElts = 8; // Index MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); Index = ExtendToType(Index, NewIndexVT, DAG); if (IndexVT.getScalarType() == MVT::i32) Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); // Mask MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); // At this point we have promoted mask operand assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); // The pass-through value MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); Src0 = ExtendToType(Src0, NewVT, DAG); SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; SDValue NewGather = DAG.getTargetMemSDNode( DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(), N->getMemOperand()); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewGather.getValue(0), DAG.getIntPtrConstant(0, dl)); SDValue RetOps[] = {Extract, NewGather.getValue(2)}; return DAG.getMergeValues(RetOps, dl); } SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; SDValue NewGather = DAG.getTargetMemSDNode( DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(), N->getMemOperand()); return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl); } SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); return NOOP; } SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); return NOOP; } /// Provide custom lowering hooks for some operations. SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::SHL_PARTS: case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::MULHS: case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::ROTL: case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::ADD: case ISD::SUB: return LowerADD_SUB(Op, DAG); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::ABS: return LowerABS(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: return LowerGC_TRANSITION_START(Op, DAG); case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG); } } /// Places new result values for the node in Results (their number /// and types must exactly match those of the original return values of /// the node), or leaves Results empty, which indicates that the node is not /// to be custom lowered after all. void X86TargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res = LowerOperation(SDValue(N, 0), DAG); if (!Res.getNode()) return; assert((N->getNumValues() <= Res->getNumValues()) && "Lowering returned the wrong number of results!"); // Places new result values base on N result number. // In some cases (LowerSINT_TO_FP for example) Res has more result values // than original node, chain should be dropped(last value). for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) Results.push_back(Res.getValue(I)); } /// Replace a node with an illegal result type with a new node built out of /// custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); case X86ISD::AVG: { // Legalize types for X86ISD::AVG by expanding vectors. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); auto InVT = N->getValueType(0); auto InVTSize = InVT.getSizeInBits(); const unsigned RegSize = (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; assert((Subtarget.hasBWI() || RegSize < 512) && "512-bit vector requires AVX512BW"); assert((Subtarget.hasAVX2() || RegSize < 256) && "256-bit vector requires AVX2"); auto ElemVT = InVT.getVectorElementType(); auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, RegSize / ElemVT.getSizeInBits()); assert(RegSize % InVT.getSizeInBits() == 0); unsigned NumConcat = RegSize / InVT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); Ops[0] = N->getOperand(0); SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); Ops[0] = N->getOperand(1); SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); if (!ExperimentalVectorWideningLegalization) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); return; } case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: case ISD::SDIVREM: case ISD::UDIVREM: { SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); Results.push_back(V); return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; if (N->getValueType(0) == MVT::v2i32) { assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); SDValue Src = N->getOperand(0); if (Src.getValueType() == MVT::v2f64) { MVT ResVT = MVT::v4i32; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { // Widen to 512-bits. ResVT = MVT::v8i32; Opc = ISD::FP_TO_UINT; Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, DAG.getUNDEF(MVT::v8f64), Src, DAG.getIntPtrConstant(0, dl)); } SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32 : MVT::v2i32; Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } if (Src.getValueType() == MVT::v2f32) { SDValue Idx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); if (!ExperimentalVectorWideningLegalization) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs, // so early out here. return; } std::pair Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode()) { EVT VT = N->getValueType(0); // Return a load from the stack slot. if (StackSlot.getNode()) Results.push_back( DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo())); else Results.push_back(FIST); } return; } case ISD::SINT_TO_FP: { assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); SDValue Src = N->getOperand(0); if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64) return; Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src)); return; } case ISD::UINT_TO_FP: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); if (VT != MVT::v2f32) return; SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src)); return; } if (SrcVT != MVT::v2i32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; } case ISD::FP_ROUND: { if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) return; SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); return; } case ISD::FP_EXTEND: { // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. // No other ValueType for FP_EXTEND should reach this point. assert(N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"); return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); case Intrinsic::x86_xgetbv: return getExtendedControlRegister(N, dl, DAG, Subtarget, Results); } } case ISD::INTRINSIC_WO_CHAIN: { if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)) Results.push_back(V); return; } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(0, dl, HalfT)); cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(1, dl, HalfT)); cpInL = DAG.getCopyToReg(N->getOperand(0), dl, Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX, cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(0, dl, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(1, dl, HalfT)); swapInH = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, cpInH.getValue(1)); // If the current function needs the base pointer, RBX, // we shouldn't use cmpxchg directly. // Indeed the lowering of that instruction will clobber // that register and since RBX will be a reserved register // the register allocator will not make sure its value will // be properly saved and restored around this live-range. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); unsigned BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast(N)->getMemOperand(); if (TRI->hasBasePointer(DAG.getMachineFunction()) && (BasePtr == X86::RBX || BasePtr == X86::EBX)) { // ISel prefers the LCMPXCHG64 variant. // If that assert breaks, that means it is not the case anymore, // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, // not just EBX. This is a matter of accepting i64 input for that // pseudo, and restoring into the register of the right wide // in expand pseudo. Everything else should just work. assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && "Saving only half of the RBX"); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG : X86ISD::LCMPXCHG8_SAVE_EBX_DAG; SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, HalfT, swapInH.getValue(1)); SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, RBXSave, /*Glue*/ RBXSave.getValue(2)}; Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); } else { unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, swapInL, swapInH.getValue(1)); SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), swapInL.getValue(1)}; Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); } SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(Success); Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; } case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) return; unsigned NumElts = DstVT.getVectorNumElements(); EVT SVT = DstVT.getVectorElementType(); EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); if (ExperimentalVectorWideningLegalization) { // If we are legalizing vectors by widening, we already have the desired // legal vector type, just return it. Results.push_back(ToVecInt); return; } SmallVector Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, ToVecInt, DAG.getIntPtrConstant(i, dl))); Results.push_back(DAG.getBuildVector(DstVT, dl, Elts)); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Gather->getValue(), DAG.getUNDEF(MVT::v2f32)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), Index }; SDValue Res = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } if (VT == MVT::v2i32) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Gather->getValue(), DAG.getUNDEF(MVT::v2i32)); // If the index is v2i64 we can use it directly. if (Index.getValueType() == MVT::v2i64 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), Index }; SDValue Res = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); SDValue Chain = Res.getValue(2); if (!ExperimentalVectorWideningLegalization) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Chain); return; } EVT IndexVT = Index.getValueType(); EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), IndexVT.getScalarType(), 4); // Otherwise we need to custom widen everything to avoid promotion. Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, DAG.getUNDEF(IndexVT)); Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getConstant(0, dl, MVT::v2i1)); SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), Index }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), Gather->getMemoryVT(), dl, Ops, Gather->getMemOperand()); SDValue Chain = Res.getValue(1); if (!ExperimentalVectorWideningLegalization) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Chain); return; } break; } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((X86ISD::NodeType)Opcode) { case X86ISD::FIRST_NUMBER: break; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; case X86ISD::CMPMU: return "X86ISD::CMPMU"; case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; case X86ISD::ADDUS: return "X86ISD::ADDUS"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAXS: return "X86ISD::FMAXS"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMINS: return "X86ISD::FMINS"; case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; case X86ISD::EH_SJLJ_SETUP_DISPATCH: return "X86ISD::EH_SJLJ_SETUP_DISPATCH"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; case X86ISD::LADD: return "X86ISD::LADD"; case X86ISD::LSUB: return "X86ISD::LSUB"; case X86ISD::LOR: return "X86ISD::LOR"; case X86ISD::LXOR: return "X86ISD::LXOR"; case X86ISD::LAND: return "X86ISD::LAND"; case X86ISD::LINC: return "X86ISD::LINC"; case X86ISD::LDEC: return "X86ISD::LDEC"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES"; case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; case X86ISD::VSRA: return "X86ISD::VSRA"; case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; case X86ISD::VSRAV: return "X86ISD::VSRAV"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::VPPERM: return "X86ISD::VPPERM"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; case X86ISD::SMUL8: return "X86ISD::SMUL8"; case X86ISD::UMUL8: return "X86ISD::UMUL8"; case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; case X86ISD::INC: return "X86ISD::INC"; case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::VSHLD: return "X86ISD::VSHLD"; case X86ISD::VSHRD: return "X86ISD::VSHRD"; case X86ISD::VSHLDV: return "X86ISD::VSHLDV"; case X86ISD::VSHRDV: return "X86ISD::VSHRDV"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::SHUF128: return "X86ISD::SHUF128"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND"; case X86ISD::VRANGES: return "X86ISD::VRANGES"; case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; case X86ISD::FMADDS1: return "X86ISD::FMADDS1"; case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1"; case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1"; case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1"; case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND"; case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND"; case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND"; case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND"; case X86ISD::FMADDS3: return "X86ISD::FMADDS3"; case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3"; case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3"; case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3"; case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND"; case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND"; case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND"; case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND"; case X86ISD::FMADD4S: return "X86ISD::FMADD4S"; case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S"; case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S"; case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S"; case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND"; case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; case X86ISD::SELECT: return "X86ISD::SELECT"; case X86ISD::SELECTS: return "X86ISD::SELECTS"; case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; case X86ISD::RCP14: return "X86ISD::RCP14"; case X86ISD::RCP14S: return "X86ISD::RCP14S"; case X86ISD::RCP28: return "X86ISD::RCP28"; case X86ISD::RCP28S: return "X86ISD::RCP28S"; case X86ISD::EXP2: return "X86ISD::EXP2"; case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND"; case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND"; case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND"; case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND"; case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD"; case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS"; case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD"; case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS"; case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB"; case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB"; case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB"; case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB"; } return nullptr; } /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV); // If a reference to this global requires an extra load, we can't fold it. if (isGlobalStubReference(GVFlags)) return false; // If BaseGV requires a register for the PIC base, we cannot also have a // BaseReg specified. if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) return false; // If lower 4G is not available, then we must use rip-relative addressing. if ((M != CodeModel::Small || isPositionIndependent()) && Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } switch (AM.Scale) { case 0: case 1: case 2: case 4: case 8: // These scales always work. break; case 3: case 5: case 9: // These scales are formed with basereg+scalereg. Only accept if there is // no basereg yet. if (AM.HasBaseReg) return false; break; default: // Other stuff never works. return false; } return true; } bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); // 8-bit shifts are always expensive, but versions with a scalar amount aren't // particularly cheaper than those without. if (Bits == 8) return false; // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable // shifts just as cheap as scalar ones. if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64)) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a // fully general vector. return true; } bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { // Can also use sub to handle negated immediates. return isInt<32>(Imm); } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) return true; if (Val.getOpcode() != ISD::LOAD) return false; if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i8: case MVT::i16: case MVT::i32: // X86 has 8, 16, and 32-bit zero-extending loads. return true; } return false; } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { if (!Subtarget.hasAnyFMA()) return false; VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); } /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool X86TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (!VT.isSimple()) return false; // Not for i1 vectors if (VT.getSimpleVT().getScalarType() == MVT::i1) return false; // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSimpleVT().getSizeInBits() == 64) return false; // We only care that the types being shuffled are legal. The lowering can // handle any possible shuffle mask that results. return isTypeLegal(VT.getSimpleVT()); } bool X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, EVT VT) const { // Just delegate to the generic legality, clear masks aren't special. return isShuffleMaskLegal(Mask, VT); } //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { DebugLoc DL = MI.getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // For the v = xbegin(), we generate // // thisMBB: // xbegin sinkMBB // // mainMBB: // s0 = -1 // // fallBB: // eax = # XABORT_DEF // s1 = eax // // sinkMBB: // v = phi(s0/mainBB, s1/fallBB) MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, fallMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned fallDstReg = MRI.createVirtualRegister(RC); // thisMBB: // xbegin fallMBB // # fallthrough to mainMBB // # abortion to fallMBB BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(fallMBB); // mainMBB: // mainDstReg := -1 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); mainMBB->addSuccessor(sinkMBB); // fallMBB: // ; pseudo instruction to model hardware's definition from XABORT // EAX := XABORT_DEF // fallDstReg := EAX BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) .addReg(X86::EAX); fallMBB->addSuccessor(sinkMBB); // sinkMBB: // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(fallDstReg).addMBB(fallMBB); MI.eraseFromParent(); return sinkMBB; } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; } DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); unsigned NumArgs = MI.getNumOperands(); for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(X86::XMM0); MI.eraseFromParent(); return BB; } // FIXME: Custom handling because TableGen doesn't support multiple implicit // defs in an instruction pattern static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; } DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); unsigned NumArgs = MI.getNumOperands(); // remove the results for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(X86::ECX); MI.eraseFromParent(); return BB; } static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // insert input VAL into EAX BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) .addReg(MI.getOperand(0).getReg()); // insert zero to ECX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); // insert zero to EDX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); // insert WRPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // insert zero to ECX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); // insert RDPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(X86::EAX); MI.eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget, unsigned Opc) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI.getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI.getOperand(ValOps).getReg()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) .addReg(MI.getOperand(ValOps + 1).getReg()); // The instruction doesn't actually take any operands though. BuildMI(*BB, MI, dl, TII->get(Opc)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // Address into RAX/EAX unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI->getOperand(i)); // The instruction doesn't actually take any operands though. BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); MI->eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. // Operands to this pseudo-instruction: // 0 ) Output : destination address (reg) // 1-5) Input : va_list address (addr, i64mem) // 6 ) ArgSize : Size (in bytes) of vararg type // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset // 8 ) Align : Alignment of type // 9 ) EFLAGS (implicit-def) assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI.getOperand(0).getReg(); MachineOperand &Base = MI.getOperand(1); MachineOperand &Scale = MI.getOperand(2); MachineOperand &Index = MI.getOperand(3); MachineOperand &Disp = MI.getOperand(4); MachineOperand &Segment = MI.getOperand(5); unsigned ArgSize = MI.getOperand(6).getImm(); unsigned ArgMode = MI.getOperand(7).getImm(); unsigned Align = MI.getOperand(8).getImm(); // Memory Reference assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); // Machine Information const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); DebugLoc DL = MI.getDebugLoc(); // struct va_list { // i32 gp_offset // i32 fp_offset // i64 overflow_area (address) // i64 reg_save_area (address) // } // sizeof(va_list) = 24 // alignment(va_list) = 8 unsigned TotalNumIntRegs = 6; unsigned TotalNumXMMRegs = 8; bool UseGPOffset = (ArgMode == 1); bool UseFPOffset = (ArgMode == 2); unsigned MaxOffset = TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; bool NeedsAlign = (Align > 8); MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *overflowMBB; MachineBasicBlock *offsetMBB; MachineBasicBlock *endMBB; unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB unsigned OffsetReg = 0; if (!UseGPOffset && !UseFPOffset) { // If we only pull from the overflow region, we don't create a branch. // We don't need to alter control flow. OffsetDestReg = 0; // unused OverflowDestReg = DestReg; offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { // First emit code to check if gp_offset (or fp_offset) is below the bound. // If so, pull the argument from reg_save_area. (branch to offsetMBB) // If not, pull from overflow_area. (branch to overflowMBB) // // thisMBB // | . // | . // offsetMBB overflowMBB // | . // | . // endMBB // Registers for the PHI in endMBB OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *MF = MBB->getParent(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = ++MBB->getIterator(); // Insert the new basic blocks MF->insert(MBBIter, offsetMBB); MF->insert(MBBIter, overflowMBB); MF->insert(MBBIter, endMBB); // Transfer the remainder of MBB and its successor edges to endMBB. endMBB->splice(endMBB->begin(), thisMBB, std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Make offsetMBB and overflowMBB successors of thisMBB thisMBB->addSuccessor(offsetMBB); thisMBB->addSuccessor(overflowMBB); // endMBB is a successor of both offsetMBB and overflowMBB offsetMBB->addSuccessor(endMBB); overflowMBB->addSuccessor(endMBB); // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .setMemRefs(MMOBegin, MMOEnd); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) .addReg(OffsetReg) .addImm(MaxOffset + 8 - ArgSizeA8); // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) .addMBB(overflowMBB); } // In offsetMBB, emit code to use the reg_save_area. if (offsetMBB) { assert(OffsetReg != 0); // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 16) .add(Segment) .setMemRefs(MMOBegin, MMOEnd); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); // Add the offset to the reg_save_area to get the final address. BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) .addReg(OffsetReg64) .addReg(RegSaveReg); // Compute the offset for the next argument unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .addReg(NextOffsetReg) .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } // // Emit code to use overflow area // // Load the overflow_area address into a register. unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 8) .add(Segment) .setMemRefs(MMOBegin, MMOEnd); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. if (NeedsAlign) { // Align the overflow address assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) .addReg(OverflowAddrReg) .addImm(Align-1); BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Align-1)); } else { BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); } // Compute the next overflow address after this argument. // (the overflow address should be kept 8-byte aligned) unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 8) .add(Segment) .addReg(NextAddrReg) .setMemRefs(MMOBegin, MMOEnd); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { BuildMI(*endMBB, endMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(OffsetDestReg).addMBB(offsetMBB) .addReg(OverflowDestReg).addMBB(overflowMBB); } // Erase the pseudo instruction MI.eraseFromParent(); return endMBB; } MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit code to save XMM registers to the stack. The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, // however this code takes a simpler approach and just executes all // of the stores if %al is non-zero. It's less code, and it's probably // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. // Create the new basic blocks. One block contains all the XMM stores, // and one block is the final destination regardless of whether any // stores were performed. const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *F = MBB->getParent(); MachineFunction::iterator MBBIter = ++MBB->getIterator(); MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, XMMSaveMBB); F->insert(MBBIter, EndMBB); // Transfer the remainder of MBB and its successor edges to EndMBB. EndMBB->splice(EndMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndMBB->transferSuccessorsAndUpdatePHIs(MBB); // The original block will now fall through to the XMM save block. MBB->addSuccessor(XMMSaveMBB); // The XMMSaveMBB will fall through to the end block. XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); unsigned CountReg = MI.getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". assert((MI.getNumOperands() <= 3 || !MI.getOperand(MI.getNumOperands() - 1).isReg() || MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) .addReg(MI.getOperand(i).getReg()) .addMemOperand(MMO); } MI.eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } // The EFLAGS operand of SelectItr might be missing a kill marker // because there were multiple uses of EFLAGS, and ISel didn't know // which to mark. Figure out whether SelectItr should have had a // kill marker, and set it if it should. Returns the correct kill // marker value. static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { // Scan forward through BB for a use/def of EFLAGS. MachineBasicBlock::iterator miI(std::next(SelectItr)); for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { const MachineInstr& mi = *miI; if (mi.readsRegister(X86::EFLAGS)) return false; if (mi.definesRegister(X86::EFLAGS)) break; // Should have kill-flag - update below. } // If we hit the end of the block, check whether EFLAGS is live into a // successor. if (miI == BB->end()) { for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), sEnd = BB->succ_end(); sItr != sEnd; ++sItr) { MachineBasicBlock* succ = *sItr; if (succ->isLiveIn(X86::EFLAGS)) return false; } } // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. SelectItr->addRegisterKilled(X86::EFLAGS, TRI); return true; } // Return true if it is OK for this CMOV pseudo-opcode to be cascaded // together with other CMOV pseudo-opcodes into a single basic-block with // conditional jump around it. static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return true; default: return false; } } // Helper function, which inserts PHI functions into SinkMBB: // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ], // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for // the last PHI function inserted. static MachineInstrBuilder createPHIsForCMOVsInSinkBB( MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB) { MachineFunction *MF = TrueMBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); DebugLoc DL = MIItBegin->getDebugLoc(); X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); // As we are creating the PHIs, we have to be careful if there is more than // one. Later CMOVs may reference the results of earlier CMOVs, but later // PHIs have to reference the individual true/false inputs from earlier PHIs. // That also means that PHI construction must work forward from earlier to // later, and that the code must maintain a mapping from earlier PHI's // destination registers, and the registers that went into the PHI. DenseMap> RegRewriteTable; MachineInstrBuilder MIB; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { unsigned DestReg = MIIt->getOperand(0).getReg(); unsigned Op1Reg = MIIt->getOperand(1).getReg(); unsigned Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the // PHI that is going to be generated. if (MIIt->getOperand(3).getImm() == OppCC) std::swap(Op1Reg, Op2Reg); if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) Op1Reg = RegRewriteTable[Op1Reg].first; if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) Op2Reg = RegRewriteTable[Op2Reg].second; MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(FalseMBB) .addReg(Op2Reg) .addMBB(TrueMBB); // Add this PHI to the rewrite table. RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); } return MIB; } // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2). MachineBasicBlock * X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, MachineInstr &SecondCascadedCMOV, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = FirstCMOV.getDebugLoc(); // We lower cascaded CMOVs such as // // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2) // // to two successive branches. // // Without this, we would add a PHI between the two jumps, which ends up // creating a few copies all around. For instance, for // // (sitofp (zext (fcmp une))) // // we would generate: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // movaps %xmm0, %xmm1 // jne .LBB5_2 // xorps %xmm1, %xmm1 // .LBB5_2: // jp .LBB5_4 // movaps %xmm1, %xmm0 // .LBB5_4: // retq // // because this custom-inserter would have generated: // // A // | \ // | B // | / // C // | \ // | D // | / // E // // A: X = ...; Y = ... // B: empty // C: Z = PHI [X, A], [Y, B] // D: empty // E: PHI [X, C], [Z, D] // // If we lower both CMOVs in a single step, we can instead generate: // // A // | \ // | C // | /| // |/ | // | | // | D // | / // E // // A: X = ...; Y = ... // D: empty // E: PHI [X, A], [X, C], [Y, D] // // Which, in our sitofp/fcmp example, gives us something like: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // jne .LBB5_4 // jp .LBB5_4 // xorps %xmm0, %xmm0 // .LBB5_4: // retq // // We lower cascaded CMOV into two successive branches to the same block. // EFLAGS is used by both, so mark it as live in the second. const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FirstInsertedMBB); F->insert(It, SecondInsertedMBB); F->insert(It, SinkMBB); // For a cascaded CMOV, we lower it to two successive branches to // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in // the FirstInsertedMBB. FirstInsertedMBB->addLiveIn(X86::EFLAGS); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) { SecondInsertedMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->begin(), ThisMBB, std::next(MachineBasicBlock::iterator(FirstCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FirstInsertedMBB); // The true block target of the first branch is always SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FirstInsertedMBB. FirstInsertedMBB->addSuccessor(SecondInsertedMBB); // The true block for the branch of FirstInsertedMBB. FirstInsertedMBB->addSuccessor(SinkMBB); // This is fallthrough. SecondInsertedMBB->addSuccessor(SinkMBB); // Create the conditional branch instructions. X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); unsigned Opc = X86::GetCondBranchFromCond(FirstCC); BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); X86::CondCode SecondCC = X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC); BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB); // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] unsigned DestReg = FirstCMOV.getOperand(0).getReg(); unsigned Op1Reg = FirstCMOV.getOperand(1).getReg(); unsigned Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(SecondInsertedMBB) .addReg(Op2Reg) .addMBB(ThisMBB); // The second SecondInsertedMBB provides the same incoming value as the // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes). MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB); // Copy the PHI result to the register defined by the second CMOV. BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, TII->get(TargetOpcode::COPY), SecondCascadedCMOV.getOperand(0).getReg()) .addReg(FirstCMOV.getOperand(0).getReg()); // Now remove the CMOVs. FirstCMOV.eraseFromParent(); SecondCascadedCMOV.eraseFromParent(); return SinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between and a branch opcode to use. // ThisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> FalseMBB // This code lowers all pseudo-CMOV instructions. Generally it lowers these // as described above, by inserting a BB, and then making a PHI at the join // point to select the true and false operands of the CMOV in the PHI. // // The code also handles two different cases of multiple CMOV opcodes // in a row. // // Case 1: // In this case, there are multiple CMOVs in a row, all which are based on // the same condition setting (or the exact opposite condition setting). // In this case we can lower all the CMOVs using a single inserted BB, and // then make a number of PHIs at the join point to model the CMOVs. The only // trickiness here, is that in a case like: // // t2 = CMOV cond1 t1, f1 // t3 = CMOV cond1 t2, f2 // // when rewriting this into PHIs, we have to perform some renaming on the // temps since you cannot have a PHI operand refer to a PHI result earlier // in the same block. The "simple" but wrong lowering would be: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t2(BB1), f2(BB2) // // but clearly t2 is not defined in BB1, so that is incorrect. The proper // renaming is to note that on the path through BB1, t2 is really just a // copy of t1, and do that renaming, properly generating: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t1(BB1), f2(BB2) // // Case 2: // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate // function - EmitLoweredCascadedSelect. X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineInstr *LastCMOV = &MI; MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { // See if we have a string of CMOVS with the same condition. while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; ++NextMIIt; } } // This checks for case 2, but only do this if we didn't already find // case 1, as indicated by LastCMOV == MI. if (LastCMOV == &MI && NextMIIt != ThisMBB->end() && NextMIIt->getOpcode() == MI.getOpcode() && NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && NextMIIt->getOperand(1).isKill()) { return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB); } const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FalseMBB); F->insert(It, SinkMBB); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!LastCMOV->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) { FalseMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->begin(), ThisMBB, std::next(MachineBasicBlock::iterator(LastCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FalseMBB); // The true block target of the first (or only) branch is always a SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FalseMBB. FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. unsigned Opc = X86::GetCondBranchFromCond(CC); BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] // ... MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator MIItEnd = std::next(MachineBasicBlock::iterator(LastCMOV)); createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB); // Now remove the CMOV(s). ThisMBB->erase(MIItBegin, MIItEnd); return SinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, MachineBasicBlock *BB) const { // Combine the following atomic floating-point modification pattern: // a.store(reg OP a.load(acquire), release) // Transform them into: // OPss (%gpr), %xmm // movss %xmm, (%gpr) // Or sd equivalent for 64-bit operations. unsigned MOp, FOp; switch (MI.getOpcode()) { default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); case X86::RELEASE_FADD32mr: FOp = X86::ADDSSrm; MOp = X86::MOVSSmr; break; case X86::RELEASE_FADD64mr: FOp = X86::ADDSDrm; MOp = X86::MOVSDmr; break; } const X86InstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned ValOpIdx = X86::AddrNumOperands; unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(FOp), MRI.createVirtualRegister(MRI.getRegClass(VSrc))) .addReg(VSrc); for (int i = 0; i < X86::AddrNumOperands; ++i) { MachineOperand &Operand = MI.getOperand(i); // Clear any kill flags on register operands as we'll create a second // instruction using the same address operands. if (Operand.isReg()) Operand.setIsKill(false); MIB.add(Operand); } MachineInstr *FOpMI = MIB; MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI.getOperand(i)); MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); const bool Is64Bit = Subtarget.is64Bit(); const bool IsLP64 = Subtarget.isTarget64BitLP64(); const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] // If stacklet is not large enough, jump to mallocMBB // // bumpMBB: // Allocate by subtracting from RSP // Jump to continueMBB // // mallocMBB: // Allocate by call to runtime // // continueMBB: // ... // [rest of original BB] // MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI.getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = ++BB->getIterator(); MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); MF->insert(MBBIter, continueMBB); continueMBB->splice(continueMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); continueMBB->transferSuccessorsAndUpdatePHIs(BB); // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EDI, RegState::Implicit) .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); BB->addSuccessor(mallocMBB); mallocMBB->addSuccessor(continueMBB); bumpMBB->addSuccessor(continueMBB); // Take care of the PHI nodes. BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), MI.getOperand(0).getReg()) .addReg(mallocPtrVReg) .addMBB(mallocMBB) .addReg(bumpSPPtrVReg) .addMBB(bumpMBB); // Delete the original pseudo instruction. MI.eraseFromParent(); // And we're done. return continueMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); DebugLoc DL = MI.getDebugLoc(); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction().getPersonalityFn())) && "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. if (!Subtarget.is32Bit()) return BB; // C++ EH creates a new target block to hold the restore code, and wires up // the new block to the return destination with a normal JMP_4. MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); assert(BB->succ_size() == 1); MF->insert(std::next(BB->getIterator()), RestoreMBB); RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RestoreMBB); MI.getOperand(0).setMBB(RestoreMBB); auto RestoreMBBI = RestoreMBB->begin(); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const Constant *PerFn = MF->getFunction().getPersonalityFn(); bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); // Only 32-bit SEH requires special handling for catchpad. if (IsSEH && Subtarget.is32Bit()) { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); } MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const { // So, here we replace TLSADDR with the sequence: // adjust_stackdown -> TLSADDR -> adjust_stackup. // We need this because TLSADDR is lowered into calls // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue pass them. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. // We don't call erase from parent because we want to keep the // original instruction around. unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); MachineInstrBuilder CallseqEnd = BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const { // This is pretty easy. We're taking the value that we received from // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); assert(MI.getOperand(3).isGlobal() && "This should be a global"); // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = Subtarget.is64Bit() ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget.is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (!isPositionIndependent()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); unsigned DstReg; unsigned MemOpndSlot = 0; unsigned CurOp = 0; DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB // SjLjSetup restoreMBB // // mainMBB: // v_main = 0 // // sinkMBB: // v = phi(main, restore) // // restoreMBB: // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); restoreMBB->setHasAddressTaken(); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); // Prepare IP either in reg or imm. if (!UseImmLabel) { PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget.is64Bit()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(restoreMBB) .addReg(0); } else { const X86InstrInfo *XII = static_cast(TII); MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } } else PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; // Store IP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); else MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); // mainMBB: // EAX = 0 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo(); X86FI->setRestoreBasePointer(MF); unsigned FramePtr = RegInfo->getFrameRegister(*MF); unsigned BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI.eraseFromParent(); return sinkMBB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; // Reload FP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI.getOperand(i)); MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), LabelOffset); else MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); else MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Jump BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); MI.eraseFromParent(); return MBB; } void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); unsigned Op = 0; unsigned VR = 0; bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); if (UseImmLabel) { Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; } else { const TargetRegisterClass *TRC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; VR = MRI->createVirtualRegister(TRC); Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; if (Subtarget.is64Bit()) BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) .addImm(1) .addReg(0) .addMBB(DispatchBB) .addReg(0); else BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36); if (UseImmLabel) MIB.addMBB(DispatchBB); else MIB.addReg(VR); } MachineBasicBlock * X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); int FI = MFI.getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. DenseMap> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (auto &MBB : *MF) { if (!MBB.isEHPad()) continue; MCSymbol *Sym = nullptr; for (const auto &MI : MBB) { if (MI.isDebugValue()) continue; assert(MI.isEHLabel() && "expected EH_LABEL"); Sym = MI.getOperand(0).getMCSymbol(); break; } if (!MF->hasCallSiteLandingPad(Sym)) continue; for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { CallSiteNumToLPad[CSI].push_back(&MBB); MaxCSNum = std::max(MaxCSNum, CSI); } } // Get an ordered list of the machine basic blocks for the jump table. std::vector LPadList; SmallPtrSet InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { for (auto &LP : CallSiteNumToLPad[CSI]) { LPadList.push_back(LP); InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); } } assert(!LPadList.empty() && "No landing pad destinations for the dispatch jump table!"); // Create the MBBs for the dispatch code. // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); DispatchBB->setIsEHPad(true); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); BuildMI(TrapBB, DL, TII->get(X86::TRAP)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(DispContBB); // Insert MBBs. MF->push_back(DispatchBB); MF->push_back(DispContBB); MF->push_back(TrapBB); // Insert code into the entry block that creates and registers the function // context. SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); // Create the jump table and associated information unsigned JTE = getJumpTableEncoding(); MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE); unsigned MJTI = JTI->createJumpTableIndex(LPadList); const X86RegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. if (RI.hasBasePointer(*MF)) { const bool FPIs64Bit = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *MFI = MF->getInfo(); MFI->setRestoreBasePointer(MF); unsigned FP = RI.getFrameRegister(*MF); unsigned BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) .addRegMask(RI.getNoPreservedMask()); } else { BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) .addRegMask(RI.getNoPreservedMask()); } // IReg is used as an index in a memory operand and therefore can't be SP unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ? 8 : 4); BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB); if (Subtarget.is64Bit()) { unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) .addReg(X86::RIP) .addImm(1) .addReg(0) .addJumpTableIndex(MJTI) .addReg(0); // movzx IReg64, IReg BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) .addImm(0) .addReg(IReg) .addImm(X86::sub_32bit); switch (JTE) { case MachineJumpTableInfo::EK_BlockAddress: // jmpq *(BReg,IReg64,8) BuildMI(DispContBB, DL, TII->get(X86::JMP64m)) .addReg(BReg) .addImm(8) .addReg(IReg64) .addImm(0) .addReg(0); break; case MachineJumpTableInfo::EK_LabelDifference32: { unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass); unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) .addReg(BReg) .addImm(4) .addReg(IReg64) .addImm(0) .addReg(0); // movsx OReg64, OReg BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg); // addq BReg, OReg64, TReg BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg) .addReg(OReg64) .addReg(BReg); // jmpq *TReg BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg); break; } default: llvm_unreachable("Unexpected jump table encoding"); } } else { // jmpl *.LJTI0_0(,IReg,4) BuildMI(DispContBB, DL, TII->get(X86::JMP32m)) .addReg(0) .addImm(4) .addReg(IReg) .addJumpTableIndex(MJTI) .addReg(0); } // Add the jump table entries as successors to the MBB. SmallPtrSet SeenMBBs; for (auto &LP : LPadList) if (SeenMBBs.insert(LP).second) DispContBB->addSuccessor(LP); // N.B. the order the invoke BBs are processed in doesn't matter here. SmallVector MBBLPads; const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. // Keep a copy of Successors since it's modified inside the loop. SmallVector Successors(MBB->succ_rbegin(), MBB->succ_rend()); // FIXME: Avoid quadratic complexity. for (auto MBBS : Successors) { if (MBBS->isEHPad()) { MBB->removeSuccessor(MBBS); MBBLPads.push_back(MBBS); } } MBB->addSuccessor(DispatchBB); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from // moving instructions to before the EH block, where they will never be // executed. for (auto &II : reverse(*MBB)) { if (!II.isCall()) continue; DenseMap DefRegs; for (auto &MOp : II.operands()) if (MOp.isReg()) DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); for (unsigned RI = 0; SavedRegs[RI]; ++RI) { unsigned Reg = SavedRegs[RI]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } break; } } // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. for (auto &LP : MBBLPads) LP->setIsEHPad(false); // The instruction is gone now. MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: case X86::TAILJMPr64_REX: case X86::TAILJMPm64_REX: llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: return BB; case X86::TLS_addr32: case X86::TLS_addr64: case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_FR128: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); case X86::RDFLAGS32: case X86::RDFLAGS64: { unsigned PushF = MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); // Permit reads of the FLAGS register without it being defined. // This intrinsic exists to read external processor state in flags, such as // the trap flag, interrupt flag, and direction flag, none of which are // modeled by the backend. Push->getOperand(2).setIsUndef(); BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::WRFLAGS32: case X86::WRFLAGS64: { unsigned Push = MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; unsigned PopF = MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg()); BuildMI(*BB, MI, DL, TII->get(PopF)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::RELEASE_FADD32mr: case X86::RELEASE_FADD64mr: return EmitLoweredAtomicFP(MI, BB); case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: case X86::FP64_TO_INT16_IN_MEM: case X86::FP64_TO_INT32_IN_MEM: case X86::FP64_TO_INT64_IN_MEM: case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); // Load the old value of the high byte of the control word... unsigned OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); // Set the high part to be round to zero... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) .addImm(0xC7F); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); // Restore the memory image of control word to original value addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) .addReg(OldCW); // Get the X86 opcode to use. unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; } X86AddressMode AM = getAddressFromInstr(&MI, 0); addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } // String/text processing lowering. case X86::PCMPISTRM128REG: case X86::VPCMPISTRM128REG: case X86::PCMPISTRM128MEM: case X86::VPCMPISTRM128MEM: case X86::PCMPESTRM128REG: case X86::VPCMPESTRM128REG: case X86::PCMPESTRM128MEM: case X86::VPCMPESTRM128MEM: assert(Subtarget.hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: case X86::VPCMPISTRIREG: case X86::PCMPISTRIMEM: case X86::VPCMPISTRIMEM: case X86::PCMPESTRIREG: case X86::VPCMPESTRIREG: case X86::PCMPESTRIMEM: case X86::VPCMPESTRIMEM: assert(Subtarget.hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo()); // Thread synchronization. case X86::MONITOR: return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); case X86::MONITORX: return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); // Cache line zero case X86::CLZERO: return emitClzero(&MI, BB, Subtarget); // PKU feature case X86::WRPKRU: return emitWRPKRU(MI, BB, Subtarget); case X86::RDPKRU: return emitRDPKRU(MI, BB, Subtarget); // xbegin case X86::XBEGIN: return emitXBegin(MI, BB, Subtarget.getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: return emitEHSjLjSetJmp(MI, BB); case X86::EH_SjLj_LongJmp32: case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); case X86::Int_eh_sjlj_setup_dispatch: return EmitSjLjDispatchBlock(MI, BB); case TargetOpcode::STATEPOINT: // As an implementation detail, STATEPOINT shares the STACKMAP format at // this point in the process. We diverge later. return emitPatchPoint(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case TargetOpcode::PATCHABLE_EVENT_CALL: // Do nothing here, handle in xray instrumentation pass. return BB; case X86::LCMPXCHG8B: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B // requires a memory operand. If it happens that current architecture is // i686 and for current function we need a base pointer // - which is ESI for i686 - register allocator would not be able to // allocate registers for an address in form of X(%reg, %reg, Y) // - there never would be enough unreserved registers during regalloc // (without the need for base ptr the only option would be X(%edi, %esi, Y). // We are giving a hand to register allocator by precomputing the address in // a new vreg using LEA. // If it is not i686 or there is no base pointer - nothing to do here. if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF)) return BB; // Even though this code does not necessarily needs the base pointer to // be ESI, we check for that. The reason: if this assert fails, there are // some changes happened in the compiler base pointer handling, which most // probably have to be addressed somehow here. assert(TRI->getBaseRegister() == X86::ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a " "base pointer in mind"); MachineRegisterInfo &MRI = MF->getRegInfo(); MVT SPTy = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); X86AddressMode AM = getAddressFromInstr(&MI, 0); // Regalloc does not need any help when the memory operand of CMPXCHG8B // does not use index register. if (AM.IndexReg == X86::NoRegister) return BB; // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its // four operand definitions that are E[ABCD] registers. We skip them and // then insert the LEA. MachineBasicBlock::iterator MBBI(MI); while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) || MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX)) --MBBI; addFullAddress( BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); setDirectAddressInInstr(&MI, 0, computedAddrVReg); return BB; } case X86::LCMPXCHG16B: return BB; case X86::LCMPXCHG8B_SAVE_EBX: case X86::LCMPXCHG16B_SAVE_RBX: { unsigned BasePtr = MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; if (!BB->isLiveIn(BasePtr)) BB->addLiveIn(BasePtr); return BB; } } } //===----------------------------------------------------------------------===// // X86 Optimization Hooks //===----------------------------------------------------------------------===// void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = Known.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); Known.resetAll(); switch (Opc) { default: break; case X86ISD::SETCC: Known.Zero.setBitsFrom(1); break; case X86ISD::MOVMSK: { unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); Known.Zero.setBitsFrom(NumLoBits); break; } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1); Known = Known.zextOrTrunc(BitWidth); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } case X86ISD::VSHLI: case X86ISD::VSRLI: { if (auto *ShiftImm = dyn_cast(Op.getOperand(1))) { if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { Known.setAllZero(); break; } DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); unsigned ShAmt = ShiftImm->getZExtValue(); if (Opc == X86ISD::VSHLI) { Known.Zero <<= ShAmt; Known.One <<= ShAmt; // Low bits are known zero. Known.Zero.setLowBits(ShAmt); } else { Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // High bits are known zero. Known.Zero.setHighBits(ShAmt); } } break; } case X86ISD::VZEXT: { // TODO: Add DemandedElts support. SDValue N0 = Op.getOperand(0); unsigned NumElts = VT.getVectorNumElements(); EVT SrcVT = N0.getValueType(); unsigned InNumElts = SrcVT.getVectorNumElements(); unsigned InBitWidth = SrcVT.getScalarSizeInBits(); assert(InNumElts >= NumElts && "Illegal VZEXT input"); Known = KnownBits(InBitWidth); APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts); DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1); Known = Known.zext(BitWidth); Known.Zero.setBitsFrom(InBitWidth); break; } case X86ISD::CMOV: { DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1); // If we don't know any bits, early out. if (Known.isUnknown()) break; KnownBits Known2; DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; Known.Zero &= Known2.Zero; break; } case X86ISD::UDIVREM8_ZEXT_HREG: // TODO: Support more than just the zero extended bits? if (Op.getResNo() != 1) break; // The remainder is zero extended. Known.Zero.setBitsFrom(8); break; } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned VTBits = Op.getScalarValueSizeInBits(); unsigned Opcode = Op.getOpcode(); switch (Opcode) { case X86ISD::SETCC_CARRY: // SETCC_CARRY sets the dest to ~0 for true or 0 for false. return VTBits; case X86ISD::VSEXT: { // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); Tmp += VTBits - Src.getScalarValueSizeInBits(); return Tmp; } case X86ISD::VTRUNC: { // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); unsigned NumSrcBits = Src.getScalarValueSizeInBits(); assert(VTBits < NumSrcBits && "Illegal truncation input type"); unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); if (Tmp > (NumSrcBits - VTBits)) return Tmp - (NumSrcBits - VTBits); return 1; } case X86ISD::PACKSS: { // PACKSS is just a truncation if the sign bits extend to the packed size. // TODO: Add DemandedElts support. unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits(); unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); unsigned Tmp = std::min(Tmp0, Tmp1); if (Tmp > (SrcBits - VTBits)) return Tmp - (SrcBits - VTBits); return 1; } case X86ISD::VSHLI: { SDValue Src = Op.getOperand(0); APInt ShiftVal = cast(Op.getOperand(1))->getAPIntValue(); if (ShiftVal.uge(VTBits)) return VTBits; // Shifted all bits out --> zero. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); if (ShiftVal.uge(Tmp)) return 1; // Shifted all sign bits out --> unknown. return Tmp - ShiftVal.getZExtValue(); } case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); APInt ShiftVal = cast(Op.getOperand(1))->getAPIntValue(); if (ShiftVal.uge(VTBits - 1)) return VTBits; // Sign splat. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); ShiftVal += Tmp; return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); } case X86ISD::PCMPGT: case X86ISD::PCMPEQ: case X86ISD::CMPP: case X86ISD::VPCOM: case X86ISD::VPCOMU: // Vector compares return zero/all-bits result values. return VTBits; case X86ISD::CMOV: { unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp0 == 1) return 1; // Early out. unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1); return std::min(Tmp0, Tmp1); } case X86ISD::SDIVREM8_SEXT_HREG: // TODO: Support more than just the sign extended bits? if (Op.getResNo() != 1) break; // The remainder is sign extended. return VTBits - 7; } // Fallback case. return 1; } SDValue X86TargetLowering::unwrapAddress(SDValue N) const { if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP) return N->getOperand(0); return N; } /// Returns true (and the GlobalValue and the offset) if the node is a /// GlobalAddress + offset. bool X86TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const { if (N->getOpcode() == X86ISD::Wrapper) { if (isa(N->getOperand(0))) { GA = cast(N->getOperand(0))->getGlobal(); Offset = cast(N->getOperand(0))->getOffset(); return true; } } return TargetLowering::isGAPlusOffset(N, GA, Offset); } // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { bool Match = true; unsigned NumDstElts = NumMaskElts / Scale; for (unsigned i = 0; i != NumDstElts && Match; ++i) { Match &= isUndefOrEqual(Mask[i * Scale], (int)i); Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } if (Match) { unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : MVT::getIntegerVT(MaskEltSize); SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) { V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); Shuffle = unsigned(X86ISD::VZEXT); } else Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); return true; } } } // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && isUndefOrEqual(Mask[0], 0) && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { Shuffle = X86ISD::VZEXT_MOVL; SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } // Check if we have SSE3 which will let us use MOVDDUP etc. The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v4f32; return true; } } if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v4f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v8f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v8f32; return true; } } if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v16f32; return true; } } // Attempt to match against broadcast-from-vector. if (Subtarget.hasAVX2()) { SmallVector BroadcastMask(NumMaskElts, 0); if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { SrcVT = DstVT = MaskVT; Shuffle = X86ISD::VBROADCAST; return true; } } return false; } // Attempt to match a combined shuffle mask against supported unary immediate // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); bool ContainsZeros = llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. if (!ContainsZeros && MaskScalarSizeInBits == 64) { // Check for lane crossing permutes. if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); PermuteImm = getV4X86ShuffleImm(Mask); return true; } if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { SmallVector RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); PermuteImm = getV4X86ShuffleImm(RepeatedMask); return true; } } } else if (AllowFloatDomain && Subtarget.hasAVX()) { // VPERMILPD can permute with a non-repeating shuffle. Shuffle = X86ISD::VPERMILPI; ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); PermuteImm = 0; for (int i = 0, e = Mask.size(); i != e; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); PermuteImm |= (M & 1) << i; } return true; } } // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { // Narrow the repeated mask to create 32-bit element permutes. SmallVector WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) scaleShuffleMask(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); PermuteImm = getV4X86ShuffleImm(WordMask); return true; } } // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef LoMask(Mask.data() + 0, 4); ArrayRef HiMask(Mask.data() + 4, 4); // PSHUFLW: permute lower 4 elements only. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { Shuffle = X86ISD::PSHUFLW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(LoMask); return true; } // PSHUFHW: permute upper 4 elements only. if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { // Offset the HiMask so that we can create the shuffle immediate. int OffsetHiMask[4]; for (int i = 0; i != 4; ++i) OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); Shuffle = X86ISD::PSHUFHW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } } } // Attempt to match against byte/bit shifts. // FIXME: Add 512-bit support. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; return true; } } return false; } // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVLHPS; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVHLPS; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; SrcVT = DstVT = MaskVT; return true; } if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; SrcVT = DstVT = MaskVT; return true; } } // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle. // TODO add support for 256/512-bit types. if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) { if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, Subtarget)) { DstVT = MaskVT; return true; } } // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle. if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || (MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, Subtarget)) { SrcVT = DstVT = MaskVT; if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); return true; } } return false; } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); PermuteImm = ByteRotation; return true; } } // Attempt to combine to X86ISD::BLENDI. if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector TargetMask(Mask.begin(), Mask.end()); if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, RepeatedMask)) { assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); PermuteImm = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) PermuteImm |= 1 << i; V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::BLENDI; ShuffleVT = MaskVT; return true; } } else { // Determine a type compatible with X86ISD::BLENDI. ShuffleVT = MaskVT; if (Subtarget.hasAVX2()) { if (ShuffleVT == MVT::v4i64) ShuffleVT = MVT::v8i32; else if (ShuffleVT == MVT::v2i64) ShuffleVT = MVT::v4i32; } else { if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) ShuffleVT = MVT::v8i16; else if (ShuffleVT == MVT::v4i64) ShuffleVT = MVT::v4f64; else if (ShuffleVT == MVT::v8i32) ShuffleVT = MVT::v8f32; } if (!ShuffleVT.isFloatingPoint()) { int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); BlendMask = scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); } V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; return true; } } } // Attempt to combine to INSERTPS. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; } } // Attempt to combine to SHUFPD. if (AllowFloatDomain && EltSizeInBits == 64 && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; } } // Attempt to combine to SHUFPS. if (AllowFloatDomain && EltSizeInBits == 32 && ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { // Match each half of the repeated mask, to determine if its just // referencing one of the vectors, is zeroable or entirely undef. auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { int M0 = RepeatedMask[Offset]; int M1 = RepeatedMask[Offset + 1]; if (isUndefInRange(RepeatedMask, Offset, 2)) { return DAG.getUNDEF(MaskVT); } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) { S0 = (SM_SentinelUndef == M0 ? -1 : 0); S1 = (SM_SentinelUndef == M1 ? -1 : 1); return getZeroVector(MaskVT, Subtarget, DAG, DL); } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V1; } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V2; } return SDValue(); }; int ShufMask[4] = {-1, -1, -1, -1}; SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]); SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]); if (Lo && Hi) { V1 = Lo; V2 = Hi; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); PermuteImm = getV4X86ShuffleImm(ShufMask); return true; } } } return false; } /// \brief Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// /// This is the leaf of the recursive combine below. When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); assert((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"); // Find the inputs that enter the chain. Note that multiple uses are OK // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); SDValue V1 = peekThroughBitcasts(Inputs[0]); SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) : peekThroughBitcasts(Inputs[1])); MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); MVT RootVT = Root.getSimpleValueType(); assert(VT1.getSizeInBits() == RootVT.getSizeInBits() && VT2.getSizeInBits() == RootVT.getSizeInBits() && "Vector size mismatch"); SDLoc DL(Root); SDValue Res; unsigned NumBaseMaskElts = BaseMask.size(); if (NumBaseMaskElts == 1) { assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); return DAG.getBitcast(RootVT, V1); } unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size // is different from the root element size - this would prevent writemasks // from being reused. // TODO - this currently prevents all lane shuffles from occurring. // TODO - check for writemasks usage instead of always preventing combining. // TODO - attempt to narrow Mask back to writemask size. bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless // we need to use the zeroing feature. // TODO - this should support binary shuffles. if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); Res = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, DAG.getUNDEF(ShuffleVT), DAG.getConstant(PermMask, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // For masks that have been widened to 128-bit elements or more, // narrow back down to 64-bit elements. SmallVector Mask; if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; scaleShuffleMask(MaskScale, BaseMask, Mask); } else { Mask = SmallVector(BaseMask.begin(), BaseMask.end()); } unsigned NumMaskElts = Mask.size(); unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; // Determine the effective mask value type. FloatDomain &= (32 <= MaskEltSizeInBits); MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) : MVT::getIntegerVT(MaskEltSizeInBits); MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); // Only allow legal mask types. if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) return SDValue(); // Attempt to match the mask against known shuffle patterns. MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. bool AllowFloatDomain = FloatDomain || (Depth > 3); bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. APInt Zeroable(NumMaskElts, 0); for (unsigned i = 0; i != NumMaskElts; ++i) if (isUndefOrZero(Mask[i])) Zeroable.setBit(i); if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load // directly if we don't shuffle the lower element and we shuffle the upper // (zero) elements within themselves. if (V1.getOpcode() == X86ISD::VZEXT_LOAD && (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) { unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits; ArrayRef HiMask(Mask.data() + Scale, NumMaskElts - Scale); if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { return DAG.getBitcast(RootVT, V1); } } if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, DAG.getConstant(PermuteImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } } if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ShuffleSrcVT, V2); DCI.AddToWorklist(V2.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ShuffleVT, V2); DCI.AddToWorklist(V2.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2, DAG.getConstant(PermuteImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // Typically from here on, we need an integer version of MaskVT. MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); // Annoyingly, SSE4A instructions don't map into the above match helpers. if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { uint64_t BitLen, BitIdx; if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); DCI.AddToWorklist(V1.getNode()); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(IntMaskVT, V2); DCI.AddToWorklist(V2.getNode()); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX2() && (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero // vector as the second source. if (UnaryShuffle && AllowVariableMask && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasVLX() && (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { // Adjust shuffle mask - replace SM_SentinelZero with second source index. for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL); DCI.AddToWorklist(Zero.getNode()); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // If we have a dual input lane-crossing shuffle then lower to VPERMV3. if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasVLX() && (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); V1 = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(MaskVT, V2); DCI.AddToWorklist(V2.getNode()); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } return SDValue(); } // See if we can combine a single input shuffle with zeros to a bit-mask, // which is much simpler than any shuffle. if (UnaryShuffle && MaskContainsZeros && AllowVariableMask && isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { APInt Zero = APInt::getNullValue(MaskEltSizeInBits); APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits); APInt UndefElts(NumMaskElts, 0); SmallVector EltBits(NumMaskElts, Zero); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { UndefElts.setBit(i); continue; } if (M == SM_SentinelZero) continue; EltBits[i] = AllOnes; } SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); DCI.AddToWorklist(BitMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); unsigned AndOpcode = FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // If we have a single input shuffle with different shuffle patterns in the // the 128-bit lanes use the variable mask to VPERMILPS. // TODO Combine other mask types at higher depths. if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { SmallVector VPermIdx; for (int M : Mask) { SDValue Idx = M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); VPermIdx.push_back(Idx); } SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // With XOP, binary shuffles of 128/256-bit floating point vectors can combine // to VPERMIL2PD/VPERMIL2PS. if (AllowVariableMask && Subtarget.hasXOP() && (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v8f32)) { // VPERMIL2 Operation. // Bits[3] - Match Bit. // Bits[2:1] - (Per Lane) PD Shuffle Mask. // Bits[2:0] - (Per Lane) PS Shuffle Mask. unsigned NumLanes = MaskVT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumMaskElts / NumLanes; SmallVector VPerm2Idx; unsigned M2ZImm = 0; for (int M : Mask) { if (M == SM_SentinelUndef) { VPerm2Idx.push_back(-1); continue; } if (M == SM_SentinelZero) { M2ZImm = 2; VPerm2Idx.push_back(8); continue; } int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); VPerm2Idx.push_back(Index); } V1 = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(MaskVT, V2); DCI.AddToWorklist(V2.getNode()); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPerm2MaskOp.getNode()); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getConstant(M2ZImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. // Intel's manuals suggest only using PSHUFB if doing so replacing 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. if (UnaryShuffle && AllowVariableMask && ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || (RootVT.is256BitVector() && Subtarget.hasAVX2()) || (RootVT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector PSHUFBMask; int NumBytes = RootVT.getSizeInBits() / 8; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; assert((M / 16) == (i / 16) && "Lane crossing detected"); PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); Res = DAG.getBitcast(ByteVT, V1); DCI.AddToWorklist(Res.getNode()); SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // With XOP, if we have a 128-bit binary input shuffle we can always combine // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never // slower than PSHUFB on targets that support both. if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) { // VPPERM Mask Operation // Bits[4:0] - Byte Index (0 - 31) // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) SmallVector VPPERMMask; int NumBytes = 16; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { VPPERMMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::v16i8; V1 = DAG.getBitcast(ByteVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ByteVT, V2); DCI.AddToWorklist(V2.getNode()); SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); DCI.AddToWorklist(VPPERMMaskOp.getNode()); Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // Failed to find any combines. return SDValue(); } // Attempt to constant fold all of the constant source ops. // Returns true if the entire shuffle is folded to a constant. // TODO: Extend this to merge multiple constant Ops and update the mask. static SDValue combineX86ShufflesConstants(const SmallVectorImpl &Ops, ArrayRef Mask, SDValue Root, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MVT VT = Root.getSimpleValueType(); unsigned SizeInBits = VT.getSizeInBits(); unsigned NumMaskElts = Mask.size(); unsigned MaskSizeInBits = SizeInBits / NumMaskElts; unsigned NumOps = Ops.size(); // Extract constant bits from each source op. bool OneUseConstantOp = false; SmallVector UndefEltsOps(NumOps); SmallVector, 16> RawBitsOps(NumOps); for (unsigned i = 0; i != NumOps; ++i) { SDValue SrcOp = Ops[i]; OneUseConstantOp |= SrcOp.hasOneUse(); if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i], RawBitsOps[i])) return SDValue(); } // Only fold if at least one of the constants is only used once or // the combined shuffle has included a variable mask shuffle, this // is to avoid constant pool bloat. if (!OneUseConstantOp && !HasVariableMask) return SDValue(); // Shuffle the constant bits according to the mask. APInt UndefElts(NumMaskElts, 0); APInt ZeroElts(NumMaskElts, 0); APInt ConstantElts(NumMaskElts, 0); SmallVector ConstantBitData(NumMaskElts, APInt::getNullValue(MaskSizeInBits)); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { UndefElts.setBit(i); continue; } else if (M == SM_SentinelZero) { ZeroElts.setBit(i); continue; } assert(0 <= M && M < (int)(NumMaskElts * NumOps)); unsigned SrcOpIdx = (unsigned)M / NumMaskElts; unsigned SrcMaskIdx = (unsigned)M % NumMaskElts; auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; if (SrcUndefElts[SrcMaskIdx]) { UndefElts.setBit(i); continue; } auto &SrcEltBits = RawBitsOps[SrcOpIdx]; APInt &Bits = SrcEltBits[SrcMaskIdx]; if (!Bits) { ZeroElts.setBit(i); continue; } ConstantElts.setBit(i); ConstantBitData[i] = Bits; } assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()); // Create the constant data. MVT MaskSVT; if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64)) MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits); else MaskSVT = MVT::getIntegerVT(MaskSizeInBits); MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts); SDLoc DL(Root); SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); DCI.AddToWorklist(CstOp.getNode()); return DAG.getBitcast(VT, CstOp); } /// \brief Fully generic combining of x86 shuffle instructions. /// /// This should be the last combine run over the x86 shuffle instructions. Once /// they have been fully optimized, this will recursively consider all chains /// of single-use shuffle instructions, build a generic model of the cumulative /// shuffle operation, and check for simpler instructions which implement this /// operation. We use this primarily for two purposes: /// /// 1) Collapse generic shuffles to specialized single instructions when /// equivalent. In most cases, this is just an encoding size win, but /// sometimes we will collapse multiple generic shuffles into a single /// special-purpose shuffle. /// 2) Look for sequences of shuffle instructions with 3 or more total /// instructions, and replace them with the slightly more expensive SSSE3 /// PSHUFB instruction if available. We do this as the last combining step /// to ensure we avoid using PSHUFB if we can implement the shuffle with /// a suitable short sequence of other instructions. The PSHUFB will either /// use a register or have to read from memory and so is slightly (but only /// slightly) more expensive than the other shuffle instructions. /// /// Because this is inherently a quadratic operation (for each shuffle in /// a chain, we recurse up the chain), the depth is limited to 8 instructions. /// This should never be an issue in practice as the shuffle lowering doesn't /// produce sequences of more than 8 instructions. /// /// FIXME: We will currently miss some cases where the redundant shuffling /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. static SDValue combineX86ShufflesRecursively( ArrayRef SrcOps, int SrcOpIndex, SDValue Root, ArrayRef RootMask, ArrayRef SrcNodes, int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. if (Depth > 8) return SDValue(); // Directly rip through bitcasts to find the underlying operand. SDValue Op = SrcOps[SrcOpIndex]; Op = peekThroughOneUseBitcasts(Op); MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return SDValue(); // Bail if we hit a non-vector. assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. SmallVector OpMask; SmallVector OpInputs; if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) return SDValue(); assert(OpInputs.size() <= 2 && "Too many shuffle inputs"); SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue()); // Add the inputs to the Ops list, avoiding duplicates. SmallVector Ops(SrcOps.begin(), SrcOps.end()); int InputIdx0 = -1, InputIdx1 = -1; for (int i = 0, e = Ops.size(); i < e; ++i) { SDValue BC = peekThroughBitcasts(Ops[i]); if (Input0 && BC == peekThroughBitcasts(Input0)) InputIdx0 = i; if (Input1 && BC == peekThroughBitcasts(Input1)) InputIdx1 = i; } if (Input0 && InputIdx0 < 0) { InputIdx0 = SrcOpIndex; Ops[SrcOpIndex] = Input0; } if (Input1 && InputIdx1 < 0) { InputIdx1 = Ops.size(); Ops.push_back(Input1); } assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); // This function can be performance-critical, so we rely on the power-of-2 // knowledge that we have about the mask sizes to replace div/rem ops with // bit-masks and shifts. assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); unsigned MaskWidth = std::max(OpMask.size(), RootMask.size()); unsigned RootRatio = std::max(1, OpMask.size() >> RootMaskSizeLog2); unsigned OpRatio = std::max(1, RootMask.size() >> OpMaskSizeLog2); assert((RootRatio == 1 || OpRatio == 1) && "Must not have a ratio for both incoming and op masks!"); assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); unsigned RootRatioLog2 = countTrailingZeros(RootRatio); unsigned OpRatioLog2 = countTrailingZeros(OpRatio); SmallVector Mask(MaskWidth, SM_SentinelUndef); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the // root mask to get us all the way to the root value arrangement. The reason // for this order is that we are recursing up the operation chain. for (unsigned i = 0; i < MaskWidth; ++i) { unsigned RootIdx = i >> RootRatioLog2; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. Mask[i] = RootMask[RootIdx]; continue; } unsigned RootMaskedIdx = RootRatio == 1 ? RootMask[RootIdx] : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); // Just insert the scaled root mask value if it references an input other // than the SrcOp we're currently inserting. if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { Mask[i] = RootMaskedIdx; continue; } RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. Mask[i] = OpMask[OpIdx]; continue; } // Ok, we have non-zero lanes, map them through to one of the Op's inputs. unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx] : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); if (OpMask[OpIdx] < (int)OpMask.size()) { assert(0 <= InputIdx0 && "Unknown target shuffle input"); OpMaskedIdx += InputIdx0 * MaskWidth; } else { assert(0 <= InputIdx1 && "Unknown target shuffle input"); OpMaskedIdx += InputIdx1 * MaskWidth; } Mask[i] = OpMaskedIdx; } // Handle the all undef/zero cases early. if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) return DAG.getUNDEF(Root.getValueType()); // TODO - should we handle the mixed zero/undef case as well? Just returning // a zero mask will lose information on undef elements possibly reducing // future combine possibilities. if (all_of(Mask, [](int Idx) { return Idx < 0; })) return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); // Remove unused shuffle source ops. resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); // Update the list of shuffle nodes that have been combined so far. SmallVector CombinedNodes(SrcNodes.begin(), SrcNodes.end()); CombinedNodes.push_back(Op.getNode()); // See if we can recurse into each shuffle source op (if it's a target // shuffle). The source op should only be combined if it either has a // single use (i.e. current Op) or all its users have already been combined. for (int i = 0, e = Ops.size(); i < e; ++i) if (Ops[i].getNode()->hasOneUse() || SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) if (SDValue Res = combineX86ShufflesRecursively( Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask, DAG, DCI, Subtarget)) return Res; // Attempt to constant fold all of the constant source ops. if (SDValue Cst = combineX86ShufflesConstants( Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget)) return Cst; // We can only combine unary and binary shuffle mask cases. if (Ops.size() > 2) return SDValue(); // Minor canonicalization of the accumulated shuffle mask to make it easier // to match below. All this does is detect masks with sequential pairs of // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. SmallVector WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); } // Canonicalization of binary shuffle masks to improve pattern matching by // commuting the inputs. if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(Ops[0], Ops[1]); } // Finally, try to combine into a single shuffle instruction. return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG, DCI, Subtarget); } /// \brief Get the PSHUF-style mask from PSHUF node. /// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; SmallVector Ops; bool IsUnary; bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); (void)HaveMask; assert(HaveMask); // If we have more than 128-bits, only the low 128-bits of shuffle mask // matter. Check that the upper masks are repeats and remove them. if (VT.getSizeInBits() > 128) { int LaneElts = 128 / VT.getScalarSizeInBits(); #ifndef NDEBUG for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) for (int j = 0; j < LaneElts; ++j) assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && "Mask doesn't repeat in high 128-bit lanes!"); #endif Mask.resize(LaneElts); } switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; case X86ISD::PSHUFLW: Mask.resize(4); return Mask; case X86ISD::PSHUFHW: Mask.erase(Mask.begin(), Mask.begin() + 4); for (int &M : Mask) M -= 4; return Mask; default: llvm_unreachable("No valid shuffle instruction found!"); } } /// \brief Search for a combinable shuffle across a chain ending in pshufd. /// /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); // Walk up a single-use chain looking for a combinable shuffle. Keep a stack // of the shuffles in the chain so that we can form a fresh chain to replace // this one. SmallVector Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFD: // Found another dword shuffle. break; case X86ISD::PSHUFLW: // Check that the low words (being shuffled) are the identity in the // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) return SDValue(); Chain.push_back(V); continue; case X86ISD::PSHUFHW: // Check that the high words (being shuffled) are the identity in the // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) return SDValue(); Chain.push_back(V); continue; case X86ISD::UNPCKL: case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && V.getSimpleValueType().getVectorElementType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) return SDValue(); Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; Chain.push_back(V); LLVM_FALLTHROUGH; case ISD::BITCAST: V = V.getOperand(0); continue; } break; } while (V.hasOneUse()); break; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return SDValue(); // Merge this node's mask and our incoming mask. SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Rebuild the chain around this new shuffle. while (!Chain.empty()) { SDValue W = Chain.pop_back_val(); if (V.getValueType() != W.getOperand(0).getValueType()) V = DAG.getBitcast(W.getOperand(0).getValueType(), V); switch (W.getOpcode()) { default: llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); case X86ISD::UNPCKL: case X86ISD::UNPCKH: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); break; case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); break; } } if (V.getValueType() != N.getValueType()) V = DAG.getBitcast(N.getValueType(), V); // Return the new chain to replace N. return V; } /// \brief Search for a combinable shuffle across a chain ending in pshuflw or /// pshufhw. /// /// We walk up the chain, skipping shuffles of the other half and looking /// through shuffles which switch halves trying to find a shuffle of the same /// pair of dwords. static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert( (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); unsigned CombineOpcode = N.getOpcode(); // Walk up a single-use chain looking for a combinable shuffle. SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return false; // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOpcode) break; // Other-half shuffles are no-ops. continue; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return false; // Combine away the bottom node as its shuffle will be accumulated into // a preceding shuffle. DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); // Record the old value. SDValue Old = V; // Merge this node's mask and our incoming mask (adjusted to account for all // the pshufd instructions encountered). SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Check that the shuffles didn't cancel each other out. If not, we need to // combine to the new one. if (Old != V) // Replace the combinable shuffle with the combined one, updating all users // so that we re-evaluate the chain here. DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); return true; } /// \brief Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); MVT VT = N.getSimpleValueType(); SmallVector Mask; unsigned Opcode = N.getOpcode(); // Combine binary shuffle of 2 similar 'Horizontal' instructions into a // single instruction. if (VT.getScalarSizeInBits() == 64 && (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH || Opcode == X86ISD::UNPCKL)) { auto BC0 = peekThroughBitcasts(N.getOperand(0)); auto BC1 = peekThroughBitcasts(N.getOperand(1)); EVT VT0 = BC0.getValueType(); EVT VT1 = BC1.getValueType(); unsigned Opcode0 = BC0.getOpcode(); unsigned Opcode1 = BC1.getOpcode(); if (Opcode0 == Opcode1 && VT0 == VT1 && (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) { SDValue Lo, Hi; if (Opcode == X86ISD::MOVSD) { Lo = BC1.getOperand(0); Hi = BC0.getOperand(1); } else { Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); } SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); DCI.AddToWorklist(Horiz.getNode()); return DAG.getBitcast(VT, Horiz); } } switch (Opcode) { case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; case X86ISD::UNPCKL: { // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE // moves upper half elements into the lower half part. For example: // // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, // undef:v16i8 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 // // will be combined to: // // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not // happen due to advanced instructions. if (!VT.is128BitVector()) return SDValue(); auto Op0 = N.getOperand(0); auto Op1 = N.getOperand(1); if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) { ArrayRef Mask = cast(Op1.getNode())->getMask(); unsigned NumElts = VT.getVectorNumElements(); SmallVector ExpectedMask(NumElts, -1); std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, NumElts / 2); auto ShufOp = Op1.getOperand(0); if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); } return SDValue(); } case X86ISD::BLENDI: { SDValue V0 = N->getOperand(0); SDValue V1 = N->getOperand(1); assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && "Unexpected input vector types"); // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector // operands and changing the mask to 1. This saves us a bunch of // pattern-matching possibilities related to scalar math ops in SSE/AVX. // x86InstrInfo knows how to commute this back after instruction selection // if it would help register allocation. // TODO: If optimizing for size or a processor that doesn't suffer from // partial register update stalls, this should be transformed into a MOVSD // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. if (VT == MVT::v2f64) if (auto *Mask = dyn_cast(N->getOperand(2))) if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); } return SDValue(); } case X86ISD::MOVSD: case X86ISD::MOVSS: { SDValue V0 = peekThroughBitcasts(N->getOperand(0)); SDValue V1 = peekThroughBitcasts(N->getOperand(1)); bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode()); bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode()); if (isZero0 && isZero1) return SDValue(); // We often lower to MOVSD/MOVSS from integer as well as native float // types; remove unnecessary domain-crossing bitcasts if we can to make it // easier to combine shuffles later on. We've already accounted for the // domain switching cost when we decided to lower with it. bool isFloat = VT.isFloatingPoint(); bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) { MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32) : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32); V0 = DAG.getBitcast(NewVT, V0); V1 = DAG.getBitcast(NewVT, V1); return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1)); } return SDValue(); } case X86ISD::INSERTPS: { assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); SDValue Op2 = N.getOperand(2); unsigned InsertPSMask = cast(Op2)->getZExtValue(); unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; // If we zero out all elements from Op0 then we don't need to reference it. if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); // If we zero out the element from Op1 then we don't need to reference it. if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); // Attempt to merge insertps Op1 with an inner target shuffle node. SmallVector TargetMask1; SmallVector Ops1; if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { int M = TargetMask1[SrcIdx]; if (isUndefOrZero(M)) { // Zero/UNDEF insertion - zero out element and remove dependency. InsertPSMask |= (1u << DstIdx); return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Update insertps mask srcidx and reference the source input directly. assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector TargetMask0; SmallVector Ops0; if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) return SDValue(); bool Updated = false; bool UseInput00 = false; bool UseInput01 = false; for (int i = 0; i != 4; ++i) { int M = TargetMask0[i]; if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; } else if (isUndefOrZero(M)) { // If the target mask is undef/zero then we must zero the element. InsertPSMask |= (1u << i); Updated = true; continue; } // The input vector element must be inline. if (M != i && M != (i + 4)) return SDValue(); // Determine which inputs of the target shuffle we're using. UseInput00 |= (0 <= M && M < 4); UseInput01 |= (4 <= M); } // If we're not using both inputs of the target shuffle then use the // referenced input directly. if (UseInput00 && !UseInput01) { Updated = true; Op0 = Ops0[0]; } else if (!UseInput00 && UseInput01) { Updated = true; Op0 = Ops0[1]; } if (Updated) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); return SDValue(); } default: return SDValue(); } // Nuke no-op shuffles that show up after combining. if (isNoopShuffleMask(Mask)) return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); // Look for simplifications involving one or two shuffle instructions. SDValue V = N.getOperand(0); switch (N.getOpcode()) { default: break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); V = DAG.getBitcast(DVT, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); DCI.AddToWorklist(V.getNode()); return DAG.getBitcast(VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. // FIXME: This doesn't handle the location of the PSHUFD generically, and // only works when we have a PSHUFD followed by two half-shuffles. if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && V.hasOneUse()) { SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { SmallVector VMask = getPSHUFShuffleMask(V); SmallVector DMask = getPSHUFShuffleMask(D); int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int WordMask[8]; for (int i = 0; i < 4; ++i) { WordMask[i + NOffset] = Mask[i] + NOffset; WordMask[i + VOffset] = VMask[i] + VOffset; } // Map the word mask through the DWord mask. int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. V = DAG.getBitcast(VT, D.getOperand(0)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V, V); } } } break; case X86ISD::PSHUFD: if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) return NewN; break; } return SDValue(); } /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD) /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation /// are written to the parameters \p Opnd0 and \p Opnd1. /// /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes /// so it is easier to generically match. We also insert dummy vector shuffle /// nodes for the operands which explicitly discard the lanes which are unused /// by this operation to try to flow through the rest of the combiner /// the fact that they're unused. static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, SDValue &Opnd0, SDValue &Opnd1, bool matchSubAdd = false) { EVT VT = N->getValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) return false; // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return false; ArrayRef OrigMask = cast(N)->getMask(); SmallVector Mask(OrigMask.begin(), OrigMask.end()); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB; unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD; // We require the first shuffle operand to be the ExpectedOpcode node, // and the second to be the NextExpectedOpcode node. if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode) return false; // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) return false; // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) return false; // We're looking for blends between FADD and FSUB nodes. We insist on these // nodes being lined up in a specific expected pattern. if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) || isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31}))) return false; Opnd0 = LHS; Opnd1 = RHS; return true; } /// \brief Try to combine a shuffle into a target-specific add-sub or /// mul-add-sub node. static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1)) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! if (VT.is512BitVector()) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } /// \brief Try to combine a shuffle into a target-specific /// mul-sub-add node. static SDValue combineShuffleToFMSubAdd(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true)) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); // Try to generate X86ISD::FMSUBADD node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2); return SDValue(); } // We are looking for a shuffle where both sources are concatenated with undef // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so // if we can express this as a single-source shuffle, that's preferable. static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX2() || !isa(N)) return SDValue(); EVT VT = N->getValueType(0); // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); if (VT.getVectorElementType() != MVT::i32 && VT.getVectorElementType() != MVT::i64 && VT.getVectorElementType() != MVT::f32 && VT.getVectorElementType() != MVT::f64) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Check that both sources are concats with undef. if (N0.getOpcode() != ISD::CONCAT_VECTORS || N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) return SDValue(); // Construct the new shuffle mask. Elements from the first source retain their // index, but elements from the second source no longer need to skip an undef. SmallVector Mask; int NumElts = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (int Elt : SVOp->getMask()) Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); SDLoc DL(N); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), N1.getOperand(0)); return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); } /// Eliminate a redundant shuffle of a horizontal math op. static SDValue foldShuffleOfHorizOp(SDNode *N) { if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) return SDValue(); SDValue HOp = N->getOperand(0); if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD && HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB) return SDValue(); // 128-bit horizontal math instructions are defined to operate on adjacent // lanes of each operand as: // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] // ...similarly for v2f64 and v8i16. // TODO: 256-bit is not the same because...x86. if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128) return SDValue(); // When the operands of a horizontal math op are identical, the low half of // the result is the same as the high half. If the shuffle is also replicating // low and high halves, we don't need the shuffle. // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X ArrayRef Mask = cast(N)->getMask(); // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, // but this should be tied to whatever horizontal op matching and shuffle // canonicalization are producing. if (isTargetShuffleEquivalent(Mask, { 0, 0 }) || isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) || isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 })) return HOp; return SDValue(); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG)) return FMSubAdd; if (SDValue HAddSub = foldShuffleOfHorizOp(N)) return HAddSub; } // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. // // This code performs the following transformation: // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) // // We do this only if both the bitcast and the BINOP dag nodes have // one use. Also, perform this transformation only if the new binary // operation is legal. This is to avoid introducing dag nodes that // potentially need to be further expanded (or custom lowered) into a // less optimal sequence of dag nodes. if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::VECTOR_SHUFFLE && N->getOperand(0).getOpcode() == ISD::BITCAST && N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue BC0 = N0.getOperand(0); EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { bool CanFold = false; switch (Opcode) { default : break; case ISD::ADD: case ISD::SUB: case ISD::MUL: // isOperationLegal lies for integer ops on floating point types. CanFold = VT.isInteger(); break; case ISD::FADD: case ISD::FSUB: case ISD::FMUL: // isOperationLegal lies for floating point ops on integer types. CanFold = VT.isFloatingPoint(); break; } unsigned SVTNumElts = SVT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) == (int)(i * 2); for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) < 0; if (CanFold) { SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); } } } // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { Elts.push_back(Elt); continue; } Elts.clear(); break; } if (Elts.size() == VT.getVectorNumElements()) if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true)) return LD; // For AVX2, we sometimes want to combine // (vector_shuffle (concat_vectors t1, undef) // (concat_vectors t2, undef)) // Into: // (vector_shuffle (concat_vectors t1, t2), undef) // Since the latter can be efficiently lowered with VPERMD/VPERMQ if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget)) return ShufConcat; if (isTargetShuffle(N->getOpcode())) { SDValue Op(N, 0); if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget)) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle // instructions into higher-order shuffles. We do this after combining // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } } return SDValue(); } /// Check if a vector extract from a target-specific shuffle of a load can be /// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); EVT EltVT = N->getValueType(0); if (!isa(EltNo)) return SDValue(); EVT OriginalVT = InVec.getValueType(); // Peek through bitcasts, don't duplicate a load with other uses. InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); if (!CurrentVT.isVector() || CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); SmallVector ShuffleMask; SmallVector ShuffleOps; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; if (Idx == SM_SentinelZero) return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); if (Idx == SM_SentinelUndef) return DAG.getUNDEF(EltVT); assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) return SDValue(); AllowedUses = 1; // only allow 1 load use if we have a bitcast LdNode = LdNode.getOperand(0); } if (!ISD::isNormalLoad(LdNode.getNode())) return SDValue(); LoadSDNode *LN0 = cast(LdNode); if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, ShuffleMask); Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the illegal vector is scalarized on subtargets that don't have legal // vxi1 types. static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, const X86Subtarget &Subtarget) { EVT VT = BitCast.getValueType(); SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() && N0->getOpcode() == ISD::OR) { SDValue Op0 = N0->getOperand(0); SDValue Op1 = N0->getOperand(1); MVT TrunckVT; MVT BitcastVT; switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v16i1: TrunckVT = MVT::i8; BitcastVT = MVT::v8i1; break; case MVT::v32i1: TrunckVT = MVT::i16; BitcastVT = MVT::v16i1; break; case MVT::v64i1: TrunckVT = MVT::i32; BitcastVT = MVT::v32i1; break; } bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL; bool isArg0UndefLeft = Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND; bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL; bool isArg1UndefLeft = Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND; SDValue OpLeft; SDValue OpRight; if (isArg0UndefRight && isArg1UndefLeft) { OpLeft = Op0; OpRight = Op1; } else if (isArg1UndefRight && isArg0UndefLeft) { OpLeft = Op1; OpRight = Op0; } else return SDValue(); SDLoc DL(BitCast); SDValue Shr = OpLeft->getOperand(0); SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr); SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1); SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight); SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2); } if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. if (Subtarget.hasAVX512() || !Subtarget.hasSSE2()) return SDValue(); // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and // v8f64. So all legal 128-bit and 256-bit vectors are covered except for // v8i16 and v16i16. // For these two cases, we can shuffle the upper element bytes to a // consecutive sequence at the start of the vector and treat the results as // v16i8 or v32i8, and for v16i8 this is the preferable solution. However, // for v16i16 this is not the case, because the shuffle is expensive, so we // avoid sign-extending to this type entirely. // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE; switch (VecVT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v2i1: SExtVT = MVT::v2i64; FPCastVT = MVT::v2f64; break; case MVT::v4i1: SExtVT = MVT::v4i32; FPCastVT = MVT::v4f32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && N0->getOperand(0).getValueType().is256BitVector()) { SExtVT = MVT::v4i64; FPCastVT = MVT::v4f64; } break; case MVT::v8i1: SExtVT = MVT::v8i16; // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), // sign-extend to a 256-bit operation to match the compare. // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && (N0->getOperand(0).getValueType().is256BitVector() || N0->getOperand(0).getValueType().is512BitVector())) { SExtVT = MVT::v8i32; FPCastVT = MVT::v8f32; } break; case MVT::v16i1: SExtVT = MVT::v16i8; // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), // it is not profitable to sign-extend to 256-bit because this will // require an extra cross-lane shuffle which is more expensive than // truncating the result of the compare to 128-bits. break; case MVT::v32i1: SExtVT = MVT::v32i8; break; }; SDLoc DL(BitCast); SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT); if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) { // Handle pre-AVX2 cases by splitting to two v16i1's. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32); SDValue Lo = extract128BitVector(V, 0, DAG, DL); SDValue Hi = extract128BitVector(V, 16, DAG, DL); Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, DAG.getConstant(16, DL, ShiftTy)); V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi); return DAG.getZExtOrTrunc(V, DL, VT); } if (SExtVT == MVT::v8i16) { assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector"); V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V, DAG.getUNDEF(MVT::v8i16)); } else assert(SExtVT.getScalarType() != MVT::i16 && "Vectors of i16 must be packed"); if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE) V = DAG.getBitcast(FPCastVT, V); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); return DAG.getZExtOrTrunc(V, DL, VT); } static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = N0.getValueType(); // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the setcc result is scalarized on subtargets that don't have legal // vxi1 types. if (DCI.isBeforeLegalize()) if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) return V; // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. // Detect bitcasts between i32 to x86mmx low word. if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); if (N00.getValueType() == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); } // Detect bitcasts between element or subvector extraction to x86mmx. if (VT == MVT::x86mmx && (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); if (N00.getValueType().is128BitVector()) return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, DAG.getBitcast(MVT::v2i64, N00)); } // Detect bitcasts from FP_TO_SINT to x86mmx. if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) { SDLoc DL(N0); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, DAG.getBitcast(MVT::v2i64, Res)); } // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand into a floating-point logic operation. This may // create a load of a constant, but that is cheaper than materializing the // constant in an integer register and transferring it to an SSE register or // transferring the SSE operand to integer register and back. unsigned FPOpcode; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); } if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64))) return SDValue(); SDValue LogicOp0 = N0.getOperand(0); SDValue LogicOp1 = N0.getOperand(1); SDLoc DL0(N0); // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT && !isa(LogicOp0.getOperand(0))) { SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); } // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT && !isa(LogicOp1.getOperand(0))) { SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); } return SDValue(); } // Match a binop + shuffle pyramid that represents a horizontal reduction over // the elements of a vector. // Returns the vector that is being reduced on, or SDValue() if a reduction // was not matched. static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp, ArrayRef CandidateBinOps) { // The pattern must end in an extract from index 0. if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) || !isNullConstant(Extract->getOperand(1))) return SDValue(); SDValue Op = Extract->getOperand(0); unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements()); // Match against one of the candidate binary ops. if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) { return Op.getOpcode() == unsigned(BinOp); })) return SDValue(); // At each stage, we're looking for something that looks like: // %s = shufflevector <8 x i32> %op, <8 x i32> undef, // <8 x i32> // %a = binop <8 x i32> %op, %s // Where the mask changes according to the stage. E.g. for a 3-stage pyramid, // we expect something like: // <4,5,6,7,u,u,u,u> // <2,3,u,u,u,u,u,u> // <1,u,u,u,u,u,u,u> unsigned CandidateBinOp = Op.getOpcode(); for (unsigned i = 0; i < Stages; ++i) { if (Op.getOpcode() != CandidateBinOp) return SDValue(); ShuffleVectorSDNode *Shuffle = dyn_cast(Op.getOperand(0).getNode()); if (Shuffle) { Op = Op.getOperand(1); } else { Shuffle = dyn_cast(Op.getOperand(1).getNode()); Op = Op.getOperand(0); } // The first operand of the shuffle should be the same as the other operand // of the binop. if (!Shuffle || Shuffle->getOperand(0) != Op) return SDValue(); // Verify the shuffle has the expected (at this stage of the pyramid) mask. for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index) if (Shuffle->getMaskElt(Index) != MaskEnd + Index) return SDValue(); } BinOp = CandidateBinOp; return Op; } // Given a select, detect the following pattern: // 1: %2 = zext %0 to // 2: %3 = zext %1 to // 3: %4 = sub nsw %2, %3 // 4: %5 = icmp sgt %4, [0 x N] or [-1 x N] // 5: %6 = sub nsw zeroinitializer, %4 // 6: %7 = select %5, %4, %6 // This is useful as it is the input into a SAD pattern. static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, SDValue &Op1) { // Check the condition of the select instruction is greater-than. SDValue SetCC = Select->getOperand(0); if (SetCC.getOpcode() != ISD::SETCC) return false; ISD::CondCode CC = cast(SetCC.getOperand(2))->get(); if (CC != ISD::SETGT && CC != ISD::SETLT) return false; SDValue SelectOp1 = Select->getOperand(1); SDValue SelectOp2 = Select->getOperand(2); // The following instructions assume SelectOp1 is the subtraction operand // and SelectOp2 is the negation operand. // In the case of SETLT this is the other way around. if (CC == ISD::SETLT) std::swap(SelectOp1, SelectOp2); // The second operand of the select should be the negation of the first // operand, which is implemented as 0 - SelectOp1. if (!(SelectOp2.getOpcode() == ISD::SUB && ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) && SelectOp2.getOperand(1) == SelectOp1)) return false; // The first operand of SetCC is the first operand of the select, which is the // difference between the two input vectors. if (SetCC.getOperand(0) != SelectOp1) return false; // In SetLT case, The second operand of the comparison can be either 1 or 0. APInt SplatVal; if ((CC == ISD::SETLT) && !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) && SplatVal.isOneValue()) || (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) return false; // In SetGT case, The second operand of the comparison can be either -1 or 0. if ((CC == ISD::SETGT) && !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) return false; // The first operand of the select is the difference between the two input // vectors. if (SelectOp1.getOpcode() != ISD::SUB) return false; Op0 = SelectOp1.getOperand(0); Op1 = SelectOp1.getOperand(1); // Check if the operands of the sub are zero-extended from vectors of i8. if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || Op1.getOpcode() != ISD::ZERO_EXTEND || Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) return false; return true; } // Given two zexts of to , create a PSADBW of the inputs // to these zexts. static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL) { // Find the appropriate width for the PSADBW. EVT InVT = Zext0.getOperand(0).getValueType(); unsigned RegSize = std::max(128u, InVT.getSizeInBits()); // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we // fill in the missing vector elements with 0. unsigned NumConcat = RegSize / InVT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getConstant(0, DL, InVT)); Ops[0] = Zext0.getOperand(0); MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); Ops[0] = Zext1.getOperand(0); SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); // Actually build the SAD MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); } // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with // PHMINPOSUW. static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE41. if (!Subtarget.hasSSE41()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. unsigned BinOp; SDValue Src = matchBinOpReduction( Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}); if (!Src) return SDValue(); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getScalarType(); if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) return SDValue(); SDLoc DL(Extract); SDValue MinPos = Src; // First, reduce the source down to 128-bit, applying BinOp to lo/hi. while (SrcVT.getSizeInBits() > 128) { unsigned NumElts = SrcVT.getVectorNumElements(); unsigned NumSubElts = NumElts / 2; SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts); unsigned SubSizeInBits = SrcVT.getSizeInBits(); SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits); SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && "Unexpected value type"); // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask // to flip the value accordingly. SDValue Mask; unsigned MaskEltsBits = ExtractVT.getSizeInBits(); if (BinOp == ISD::SMAX) Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::SMIN) Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::UMAX) Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); // For v16i8 cases we need to perform UMIN on pairs of byte elements, // shuffling each upper element down and insert zeros. This means that the // v16i8 UMIN will leave the upper element as zero, performing zero-extension // ready for the PHMINPOS. if (ExtractVT == MVT::i8) { SDValue Upper = DAG.getVectorShuffle( SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL), {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); } // Perform the PHMINPOS on a v8i16 vector, MinPos = DAG.getBitcast(MVT::v8i16, MinPos); MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); MinPos = DAG.getBitcast(SrcVT, MinPos); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, DAG.getIntPtrConstant(0, DL)); } // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE2 or with AVX512VL (which uses predicate registers). if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); unsigned BitWidth = ExtractVT.getSizeInBits(); if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. unsigned BinOp = 0; SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); if (!Match) return SDValue(); // EXTRACT_VECTOR_ELT can require implicit extension of the vector element // which we can't support here for now. if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); // We require AVX2 for PMOVMSKB for v16i16/v32i8; unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) return SDValue(); // Don't bother performing this for 2-element vectors. if (Match.getValueType().getVectorNumElements() <= 2) return SDValue(); // Check that we are extracting a reduction of all sign bits. if (DAG.ComputeNumSignBits(Match) != BitWidth) return SDValue(); // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. MVT MaskVT; if (64 == BitWidth || 32 == BitWidth) MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), MatchSizeInBits / BitWidth); else MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); APInt CompareBits; ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 CompareBits = APInt::getNullValue(32); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); CondCode = ISD::CondCode::SETEQ; } // Perform the select as i32/i64 and then truncate to avoid partial register // stalls. unsigned ResWidth = std::max(BitWidth, 32u); EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); SDLoc DL(Extract); SDValue Zero = DAG.getConstant(0, DL, ResVT); SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); SDValue Res = DAG.getBitcast(MaskVT, Match); Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), Ones, Zero, CondCode); return DAG.getSExtOrTrunc(Res, DL, ExtractVT); } static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. if (!Subtarget.hasSSE2()) return SDValue(); // Verify the type we're extracting from is any integer type above i16. EVT VT = Extract->getOperand(0).getValueType(); if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) return SDValue(); unsigned RegSize = 128; if (Subtarget.hasBWI()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. if (RegSize / VT.getVectorNumElements() < 8) return SDValue(); // Match shuffle + add pyramid. unsigned BinOp = 0; SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD}); // The operand is expected to be zero extended from i8 // (verified in detectZextAbsDiff). // In order to convert to i64 and above, additional any/zero/sign // extend is expected. // The zero extend from 32 bit has no mathematical effect on the result. // Also the sign extend is basically zero extend // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || Root.getOpcode() == ISD::ZERO_EXTEND || Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. if (!Root || (Root.getOpcode() != ISD::VSELECT)) return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. SDValue Zext0, Zext1; if (!detectZextAbsDiff(Root, Zext0, Zext1)) return SDValue(); // Create the SAD instruction. SDLoc DL(Extract); SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL); // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); MVT SadVT = SAD.getSimpleValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); for(unsigned i = Stages - 3; i > 0; --i) { SmallVector Mask(SadElems, -1); for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) Mask[j] = MaskEnd + j; SDValue Shuffle = DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask); SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle); } } MVT Type = Extract->getSimpleValueType(0); unsigned TypeSizeInBits = Type.getSizeInBits(); // Return the lowest TypeSizeInBits bits. MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, Extract->getOperand(1)); } // Attempt to peek through a target shuffle and extract the scalar from the // source. static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue Src = N->getOperand(0); SDValue Idx = N->getOperand(1); EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); // Don't attempt this for boolean mask vectors or unknown extraction indices. if (SrcSVT == MVT::i1 || !isa(Idx)) return SDValue(); // Resolve the target shuffle inputs and mask. SmallVector Mask; SmallVector Ops; if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. if (Mask.size() != NumSrcElts) { if ((NumSrcElts % Mask.size()) == 0) { SmallVector ScaledMask; int Scale = NumSrcElts / Mask.size(); scaleShuffleMask(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { SmallVector WidenedMask; while (Mask.size() > NumSrcElts && canWidenShuffleElements(Mask, WidenedMask)) Mask = std::move(WidenedMask); // TODO - investigate support for wider shuffle masks with known upper // undef/zero elements for implicit zero-extension. } } // Check if narrowing/widening failed. if (Mask.size() != NumSrcElts) return SDValue(); int SrcIdx = Mask[N->getConstantOperandVal(1)]; SDLoc dl(N); // If the shuffle source element is undef/zero then we can just accept it. if (SrcIdx == SM_SentinelUndef) return DAG.getUNDEF(VT); if (SrcIdx == SM_SentinelZero) return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) : DAG.getConstant(0, dl, VT); SDValue SrcOp = Ops[SrcIdx / Mask.size()]; SrcOp = DAG.getBitcast(SrcVT, SrcOp); SrcIdx = SrcIdx % Mask.size(); // We can only extract other elements from 128-bit vectors and in certain // circumstances, depending on SSE-level. // TODO: Investigate using extract_subvector for larger vectors. // TODO: Investigate float/double extraction if it will be just stored. if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { assert(SrcSVT == VT && "Unexpected extraction type"); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); } if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && "Unexpected extraction type"); unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); return DAG.getZExtOrTrunc(ExtOp, dl, VT); } return SDValue(); } /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading /// scalars back, while for x64 we should use 64-bit extracts and shifts. static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) return NewOp; // TODO - Remove this once we can handle the implicit zero-extension of // X86ISD::PEXTRW/X86ISD::PEXTRB in: // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and // combineBasicSADPattern. if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; SDValue InputVector = N->getOperand(0); SDValue EltIdx = N->getOperand(1); EVT SrcVT = InputVector.getValueType(); EVT VT = N->getValueType(0); SDLoc dl(InputVector); // Detect mmx extraction of all bits as a i64. It works better as a bitcast. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getBitcast(VT, InputVector); } // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && isa(EltIdx) && isa(InputVector.getOperand(0))) { uint64_t ExtractedElt = N->getConstantOperandVal(1); uint64_t InputValue = InputVector.getConstantOperandVal(0); uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } // Check whether this extract is the root of a sum of absolute differences // pattern. This has to be done here because we really want it to happen // pre-legalization, if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) return SAD; // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) return Cmp; // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW. if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. if (SrcVT != MVT::v4i32) return SDValue(); // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a // single use which is a sign-extend or zero-extend, and all elements are // used. SmallVector Uses; unsigned ExtractedElements = 0; for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() != InputVector.getResNo()) return SDValue(); SDNode *Extract = *UI; if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); if (Extract->getValueType(0) != MVT::i32) return SDValue(); if (!Extract->hasOneUse()) return SDValue(); if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); if (!isa(Extract->getOperand(1))) return SDValue(); // Record which element was extracted. ExtractedElements |= 1 << Extract->getConstantOperandVal(1); Uses.push_back(Extract); } // If not all the elements were used, this may not be worthwhile. if (ExtractedElements != 15) return SDValue(); // Ok, we've now decided to do the transformation. // If 64-bit shifts are legal, use the extract-shift sequence, // otherwise bounce the vector off the cache. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vals[4]; if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); auto &DL = DAG.getDataLayout(); EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(0, dl, VecIdxTy)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(1, dl, VecIdxTy)); SDValue ShAmt = DAG.getConstant( 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); } else { // Store the value to a temporary stack slot. SDValue StackPtr = DAG.CreateStackTemporary(SrcVT); SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, MachinePointerInfo()); EVT ElementType = SrcVT.getVectorElementType(); unsigned EltSize = ElementType.getSizeInBits() / 8; // Replace each use (extract) with a load of the appropriate element. for (unsigned i = 0; i < 4; ++i) { uint64_t Offset = EltSize * i; auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); // Load the scalar. Vals[i] = DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo()); } } // Replace the extracts for (SmallVectorImpl::iterator UI = Uses.begin(), UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; uint64_t IdxVal = Extract->getConstantOperandVal(1); DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } // The replacement was made in place; don't return anything. return SDValue(); } /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() != ISD::VSELECT) return SDValue(); assert(CondVT.isVector() && "Vector select expects a vector selector!"); bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); // Check if the first operand is all zeros and Cond type is vXi1. // This situation only applies to avx512. if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getAllOnesConstant(DL, CondVT)); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must // have already been promoted from the IR select condition type . // Don't check if the types themselves are equal because that excludes // vector floating-point selects. if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); // Try to invert the condition if true value is not all 1s and false value is // not all 0s. if (!TValIsAllOnes && !FValIsAllZeros && // Check if the selector will be produced by CMPP*/PCMP*. Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted. TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == CondVT) { bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); if (TValIsAllZeros || FValIsAllOnes) { SDValue CC = Cond.getOperand(2); ISD::CondCode NewCC = ISD::getSetCCInverse(cast(CC)->get(), Cond.getOperand(0).getValueType().isInteger()); Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); std::swap(LHS, RHS); TValIsAllOnes = FValIsAllOnes; FValIsAllZeros = TValIsAllZeros; } } // Cond value must be 'sign splat' to be converted to a logical op. if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits()) return SDValue(); // vselect Cond, 111..., 000... -> Cond if (TValIsAllOnes && FValIsAllZeros) return DAG.getBitcast(VT, Cond); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT)) return SDValue(); // vselect Cond, 111..., X -> or Cond, X if (TValIsAllOnes) { SDValue CastRHS = DAG.getBitcast(CondVT, RHS); SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS); return DAG.getBitcast(VT, Or); } // vselect Cond, X, 000... -> and Cond, X if (FValIsAllZeros) { SDValue CastLHS = DAG.getBitcast(CondVT, LHS); SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS); return DAG.getBitcast(VT, And); } // vselect Cond, 000..., X -> andn Cond, X if (TValIsAllZeros) { MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64); SDValue CastCond = DAG.getBitcast(AndNVT, Cond); SDValue CastRHS = DAG.getBitcast(AndNVT, RHS); SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS); return DAG.getBitcast(VT, AndN); } return SDValue(); } static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); SDLoc DL(N); auto *TrueC = dyn_cast(LHS); auto *FalseC = dyn_cast(RHS); if (!TrueC || !FalseC) return SDValue(); // Don't do this for crazy integer types. EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // We're going to use the condition bit in math or logic ops. We could allow // this with a wider condition value (post-legalization it becomes an i8), // but if nothing is creating selects that late, it doesn't matter. if (Cond.getValueType() != MVT::i1) return SDValue(); // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by // 3, 5, or 9 with i32/i64, so those get transformed too. // TODO: For constants that overflow or do not differ by power-of-2 or small // multiplier, convert to 'and' + 'add'. const APInt &TrueVal = TrueC->getAPIntValue(); const APInt &FalseVal = FalseC->getAPIntValue(); bool OV; APInt Diff = TrueVal.ssub_ov(FalseVal, OV); if (OV) return SDValue(); APInt AbsDiff = Diff.abs(); if (AbsDiff.isPowerOf2() || ((VT == MVT::i32 || VT == MVT::i64) && (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) { // We need a positive multiplier constant for shift/LEA codegen. The 'not' // of the condition can usually be folded into a compare predicate, but even // without that, the sequence should be cheaper than a CMOV alternative. if (TrueVal.slt(FalseVal)) { Cond = DAG.getNOT(DL, Cond, MVT::i1); std::swap(TrueC, FalseC); } // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); // Multiply condition by the difference if non-one. if (!AbsDiff.isOneValue()) R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT)); // Add the base if non-zero. if (!FalseC->isNullValue()) R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0)); return R; } return SDValue(); } // If this is a bitcasted op that can be represented as another type, push the // the bitcast to the inputs. This allows more opportunities for pattern // matching masked instructions. This is called when we know that the operation // is used as one of the inputs of a vselect. static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // Make sure we have a bitcast. if (OrigOp.getOpcode() != ISD::BITCAST) return false; SDValue Op = OrigOp.getOperand(0); // If the operation is used by anything other than the bitcast, we shouldn't // do this combine as that would replicate the operation. if (!Op.hasOneUse()) return false; MVT VT = OrigOp.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); SDLoc DL(Op.getNode()); auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1, SDValue Op2) { Op0 = DAG.getBitcast(VT, Op0); DCI.AddToWorklist(Op0.getNode()); Op1 = DAG.getBitcast(VT, Op1); DCI.AddToWorklist(Op1.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2)); return true; }; unsigned Opcode = Op.getOpcode(); switch (Opcode) { case X86ISD::SHUF128: { if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64) return false; // Only change element size, not type. if (VT.isInteger() != Op.getSimpleValueType().isInteger()) return false; return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } case X86ISD::SUBV_BROADCAST: { unsigned EltSize = EltVT.getSizeInBits(); if (EltSize != 32 && EltSize != 64) return false; // Only change element size, not type. if (VT.isInteger() != Op.getSimpleValueType().isInteger()) return false; SDValue Op0 = Op.getOperand(0); MVT Op0VT = MVT::getVectorVT(EltVT, Op0.getSimpleValueType().getSizeInBits() / EltSize); Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0)); DCI.AddToWorklist(Op0.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0)); return true; } } return false; } /// Do target-specific dag combines on SELECT and VSELECT nodes. static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom x(Cond.getOperand(2))->get(); unsigned Opcode = 0; // Check for x CC y ? x : y. if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; break; case ISD::SETULE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMIN; break; case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETUGT: // Converting this to a max would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETUGE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMAX; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETOGE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; break; case ISD::SETUGE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMIN; break; case ISD::SETULT: // Converting this to a max would handle NaNs incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETOLE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETULE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMAX; break; } } if (Opcode) return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } // v16i8 (select v16i1, v16i8, v16i8) does not have a proper // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. // The same situation for all 128 and 256-bit vectors of i8 and i16. // Since SKX these selects have a proper lowering. if (Subtarget.hasAVX512() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && (VT.is128BitVector() || VT.is256BitVector()) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16) && !(Subtarget.hasBWI() && Subtarget.hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); } if (SDValue V = combineSelectOfTwoConstants(N, DAG)) return V; // Canonicalize max and min: // (x > y) ? x : y -> (x >= y) ? x : y // (x < y) ? x : y -> (x <= y) ? x : y // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates // the need for an extra compare // against zero. e.g. // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 // subl %esi, %edi // testl %edi, %edi // movl $0, %eax // cmovgl %edi, %eax // => // xorl %eax, %eax // subl %esi, $edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); switch (CC) { default: break; case ISD::SETLT: case ISD::SETGT: { ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } } // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); // Match VSELECTs into subs with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); // Check if one of the arms of the VSELECT is a zero vector. If it's on the // left side invert the predicate to simplify logic below. SDValue Other; if (ISD::isBuildVectorAllZeros(LHS.getNode())) { Other = RHS; CC = ISD::getSetCCInverse(CC, true); } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { Other = LHS; } if (Other.getNode() && Other->getNumOperands() == 2 && DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); SDValue CondRHS = Cond->getOperand(1); // Look for a general sub with unsigned saturation first. // x >= y ? x-y : 0 --> subus x, y // x > y ? x-y : 0 --> subus x, y if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); if (auto *OpRHSBV = dyn_cast(OpRHS)) if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { if (auto *CondRHSBV = dyn_cast(CondRHS)) if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && CondRHSConst->getAPIntValue() == (-OpRHSConst->getAPIntValue() - 1)) return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT)); // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. // FIXME: Would it be better to use computeKnownBits to determine // whether it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && OpRHSConst->getAPIntValue().isSignMask()) // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT)); } } } if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) return V; // If this is a *dynamic* select (non-constant condition) and we can match // this node with one of the variable blend instructions, restructure the // condition so that blends can use the high (sign) bit of each element and // use SimplifyDemandedBits to simplify the condition operand. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getScalarValueSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) return SDValue(); // We can only handle the cases where VSELECT is directly legal on the // subtarget. We custom lower VSELECT nodes with constant conditions and // this makes it hard to see whether a dynamic VSELECT will correctly // lower, so we both check the operation's status and explicitly handle the // cases where a *dynamic* blend will fail even though a constant-condition // blend could be custom lowered. // FIXME: We should find a better way to handle this class of problems. // Potentially, we should combine constant-condition vselect nodes // pre-legalization into shuffles and not mark as many types as custom // lowered. if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); // FIXME: We don't support i16-element blends currently. We could and // should support them by making *all* the bits in the condition be set // rather than just the high bit and using an i8-element blend. if (VT.getVectorElementType() == MVT::i16) return SDValue(); // Dynamic blending was only available from SSE4.1 onward. if (VT.is128BitVector() && !Subtarget.hasSSE41()) return SDValue(); // Byte blends are only available in AVX2 if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) return SDValue(); // There are no 512-bit blend instructions that use sign bits. if (VT.is512BitVector()) return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask(APInt::getSignMask(BitWidth)); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) { // If we changed the computation somewhere in the DAG, this change will // affect all users of Cond. Make sure it is fine and update all the nodes // so that we do not use the generic VSELECT anymore. Otherwise, we may // perform wrong optimizations as we messed with the actual expectation // for the vector boolean values. if (Cond != TLO.Old) { // Check all uses of the condition operand to check whether it will be // consumed by non-BLEND instructions. Those may require that all bits // are set properly. for (SDNode *U : Cond->uses()) { // TODO: Add other opcodes eventually lowered into BLEND. if (U->getOpcode() != ISD::VSELECT) return SDValue(); } // Update all users of the condition before committing the change, so // that the VSELECT optimizations that expect the correct vector boolean // value will not be triggered. for (SDNode *U : Cond->uses()) { SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0), Cond, U->getOperand(1), U->getOperand(2)); DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); } DCI.CommitTargetLoweringOpt(TLO); return SDValue(); } // Only Cond (rather than other nodes in the computation chain) was // changed. Change the condition just for N to keep the opportunity to // optimize all other users their own way. SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB); return SDValue(); } } // Look for vselects with LHS/RHS being bitcasted from an operation that // can be executed on another type. Push the bitcast to the inputs of // the operation. This exposes opportunities for using masking instructions. if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() && CondVT.getVectorElementType() == MVT::i1) { if (combineBitcastForMaskedOp(LHS, DAG, DCI)) return SDValue(N, 0); if (combineBitcastForMaskedOp(RHS, DAG, DCI)) return SDValue(N, 0); } // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); RHS = DAG.getBitcast(MVT::i64, RHS); SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS); return DAG.getBitcast(VT, newSelect); } return SDValue(); } /// Combine: /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) /// to: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) /// i.e., reusing the EFLAGS produced by the LOCKed instruction. /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Can't replace the cmp if it has more uses than the one we're looking at. // FIXME: We would like to be able to handle this, but would need to make sure // all uses were updated. if (!Cmp.hasOneUse()) return SDValue(); // This only applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) // Using the proper condcodes (see below), overflow is checked for. // FIXME: We can generalize both constraints: // - XOR/OR/AND (if they were made to survive AtomicExpand) // - LHS != 1 // if the result is compared. SDValue CmpLHS = Cmp.getOperand(0); SDValue CmpRHS = Cmp.getOperand(1); if (!CmpLHS.hasOneUse()) return SDValue(); unsigned Opc = CmpLHS.getOpcode(); if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) return SDValue(); SDValue OpRHS = CmpLHS.getOperand(2); auto *OpRHSC = dyn_cast(OpRHS); if (!OpRHSC) return SDValue(); APInt Addend = OpRHSC->getAPIntValue(); if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; auto *CmpRHSC = dyn_cast(CmpRHS); if (!CmpRHSC) return SDValue(); APInt Comparison = CmpRHSC->getAPIntValue(); // If the addend is the negation of the comparison value, then we can do // a full comparison by emitting the atomic arithmetic as a locked sub. if (Comparison == -Addend) { // The CC is fine, but we need to rewrite the LHS of the comparison as an // atomic sub. auto *AN = cast(CmpLHS.getNode()); auto AtomicSub = DAG.getAtomic( ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(), /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1), /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()), AN->getMemOperand()); // If the comparision uses the CF flag we can't use INC/DEC instructions. bool NeedCF = false; switch (CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; } auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // We can handle comparisons with zero in a number of cases by manipulating // the CC used. if (!Comparison.isNullValue()) return SDValue(); if (CC == X86::COND_S && Addend == 1) CC = X86::COND_LE; else if (CC == X86::COND_NS && Addend == 1) CC = X86::COND_G; else if (CC == X86::COND_G && Addend == -1) CC = X86::COND_GE; else if (CC == X86::COND_LE && Addend == -1) CC = X86::COND_L; else return SDValue(); SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // Check whether a boolean test is testing a boolean value generated by // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition // code. // // Simplify the following patterns: // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) // to (Op EFLAGS Cond) // // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) // to (Op EFLAGS !Cond) // // where Op could be BRCOND or CMOV. // static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Quit if not used as a boolean value. if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); // Check CMP operands. One of them should be 0 or 1 and the other should be // an SetCC or extended from it. SDValue Op1 = Cmp.getOperand(0); SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast(Op1))) SetCC = Op2; else if ((C = dyn_cast(Op2))) SetCC = Op1; else // Quit if all operands are not constants. return SDValue(); if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; checkAgainstTrue = true; } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 or 1. return SDValue(); bool truncatedToBoolWithAnd = false; // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx < 0) break; SetCC = SetCC.getOperand(OpIdx); truncatedToBoolWithAnd = true; } else SetCC = SetCC.getOperand(0); } switch (SetCC.getOpcode()) { case X86ISD::SETCC_CARRY: // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, // i.e. it's a comparison against true but the result of SETCC_CARRY is not // truncated to i1 using 'and'. if (checkAgainstTrue && !truncatedToBoolWithAnd) break; assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!"); LLVM_FALLTHROUGH; case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(1); case X86ISD::CMOV: { // Check whether false/true value has canonical one, i.e. 0 or 1. ConstantSDNode *FVal = dyn_cast(SetCC.getOperand(0)); ConstantSDNode *TVal = dyn_cast(SetCC.getOperand(1)); // Quit if true value is not a constant. if (!TVal) return SDValue(); // Quit if false value is not a constant. if (!FVal) { SDValue Op = SetCC.getOperand(0); // Skip 'zext' or 'trunc' node. if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE) Op = Op.getOperand(0); // A special case for rdrand/rdseed, where 0 is set if false cond is // found. if ((Op.getOpcode() != X86ISD::RDRAND && Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. bool FValIsFalse = true; if (FVal && FVal->getZExtValue() != 0) { if (FVal->getZExtValue() != 1) return SDValue(); // If FVal is 1, opposite cond is needed. needOppositeCond = !needOppositeCond; FValIsFalse = false; } // Quit if TVal is not the constant opposite of FVal. if (FValIsFalse && TVal->getZExtValue() != 1) return SDValue(); if (!FValIsFalse && TVal->getZExtValue() != 0) return SDValue(); CC = X86::CondCode(SetCC.getConstantOperandVal(2)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(3); } } return SDValue(); } /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. /// Match: /// (X86or (X86setcc) (X86setcc)) /// (X86cmp (and (X86setcc) (X86setcc)), 0) static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); } isAnd = false; SDValue SetCC0, SetCC1; switch (Cond->getOpcode()) { default: return false; case ISD::AND: case X86ISD::AND: isAnd = true; LLVM_FALLTHROUGH; case ISD::OR: case X86ISD::OR: SetCC0 = Cond->getOperand(0); SetCC1 = Cond->getOperand(1); break; }; // Make sure we have SETCC nodes, using the same flags value. if (SetCC0.getOpcode() != X86ISD::SETCC || SetCC1.getOpcode() != X86ISD::SETCC || SetCC0->getOperand(1) != SetCC1->getOperand(1)) return false; CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); Flags = SetCC0->getOperand(1); return true; } // When legalizing carry, we create carries via add X, -1 // If that comes from an actual carry, via setcc, we use the // carry directly. static SDValue combineCarryThroughADD(SDValue EFLAGS) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { SDValue Carry = EFLAGS.getOperand(0); while (Carry.getOpcode() == ISD::TRUNCATE || Carry.getOpcode() == ISD::ZERO_EXTEND || Carry.getOpcode() == ISD::SIGN_EXTEND || Carry.getOpcode() == ISD::ANY_EXTEND || (Carry.getOpcode() == ISD::AND && isOneConstant(Carry.getOperand(1)))) Carry = Carry.getOperand(0); if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { if (Carry.getConstantOperandVal(0) == X86::COND_B) return Carry.getOperand(1); } } } return SDValue(); } /// Optimize an EFLAGS definition used according to the condition code \p CC /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (CC == X86::COND_B) if (SDValue Flags = combineCarryThroughADD(EFLAGS)) return Flags; if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue FalseOp = N->getOperand(0); SDValue TrueOp = N->getOperand(1); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); if (CC == X86::COND_E || CC == X86::COND_NE) { switch (Cond.getOpcode()) { default: break; case X86ISD::BSR: case X86ISD::BSF: // If operand of BSR / BSF are proven never zero, then ZF cannot be set. if (DAG.isKnownNeverZero(Cond.getOperand(0))) return (CC == X86::COND_E) ? FalseOp : TrueOp; } } // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. if (ConstantSDNode *TrueC = dyn_cast(TrueOp)) { if (ConstantSDNode *FalseC = dyn_cast(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); std::swap(TrueOp, FalseOp); } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, DAG.getConstant(ShAmt, DL, MVT::i8)); return Cond; } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; bool isFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) case 3: // result = lea base(cond, cond*2) case 4: // result = lea base( , cond*4) case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = getSETCC(CC, Cond, DL ,DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, DAG.getConstant(Diff, DL, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } } } } // Handle these cases: // (select (x != c), e, c) -> select (x != c), e, x), // (select (x == c), c, e) -> select (x == c), x, e) // where the c is an integer constant, and the "select" is the combination // of CMOV and CMP. // // The rationale for this change is that the conditional-move from a constant // needs two instructions, however, conditional-move from a register needs // only one instruction. // // CAVEAT: By replacing a constant with a symbolic value, it may obscure // some instruction-combining opportunities. This opt needs to be // postponed as late as possible. // if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast(Cond.getOperand(1))) && !isa(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast(FalseOp)) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueOp, FalseOp); } if (CC == X86::COND_E && CmpAgainst == dyn_cast(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, DL, MVT::i8), Cond }; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } } // Fold and/or of setcc's to double CMOV: // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) // // This combine lets us generate: // cmovcc1 (jcc1 if we don't have CMOV) // cmovcc2 (same) // instead of: // setcc1 // setcc2 // and/or // cmovne (jne if we don't have CMOV) // When we can't use the CMOV instruction, it might increase branch // mispredicts. // When we can use CMOV, or when there is no mispredict, this improves // throughput and reduces register pressure. // if (CC == X86::COND_NE) { SDValue Flags; X86::CondCode CC0, CC1; bool isAndSetCC; if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { if (isAndSetCC) { std::swap(FalseOp, TrueOp); CC0 = X86::GetOppositeBranchCondition(CC0); CC1 = X86::GetOppositeBranchCondition(CC1); } SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), Flags}; SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); return CMOV; } } return SDValue(); } /// Different mul shrinking modes. enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { EVT VT = N->getOperand(0).getValueType(); if (VT.getScalarSizeInBits() != 32) return false; assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2"); unsigned SignBits[2] = {1, 1}; bool IsPositive[2] = {false, false}; for (unsigned i = 0; i < 2; i++) { SDValue Opd = N->getOperand(i); // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to // compute signbits for it separately. if (Opd.getOpcode() == ISD::ANY_EXTEND) { // For anyextend, it is safe to assume an appropriate number of leading // sign/zero bits. if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8) SignBits[i] = 25; else if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i16) SignBits[i] = 17; else return false; IsPositive[i] = true; } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) { // All the operands of BUILD_VECTOR need to be int constant. // Find the smallest value range which all the operands belong to. SignBits[i] = 32; IsPositive[i] = true; for (const SDValue &SubOp : Opd.getNode()->op_values()) { if (SubOp.isUndef()) continue; auto *CN = dyn_cast(SubOp); if (!CN) return false; APInt IntVal = CN->getAPIntValue(); if (IntVal.isNegative()) IsPositive[i] = false; SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits()); } } else { SignBits[i] = DAG.ComputeNumSignBits(Opd); if (Opd.getOpcode() == ISD::ZERO_EXTEND) IsPositive[i] = true; } } bool AllPositive = IsPositive[0] && IsPositive[1]; unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); // When ranges are from -128 ~ 127, use MULS8 mode. if (MinSignBits >= 25) Mode = MULS8; // When ranges are from 0 ~ 255, use MULU8 mode. else if (AllPositive && MinSignBits >= 24) Mode = MULU8; // When ranges are from -32768 ~ 32767, use MULS16 mode. else if (MinSignBits >= 17) Mode = MULS16; // When ranges are from 0 ~ 65535, use MULU16 mode. else if (AllPositive && MinSignBits >= 16) Mode = MULU16; else return false; return true; } /// When the operands of vector mul are extended from smaller size values, /// like i8 and i16, the type of mul may be shrinked to generate more /// efficient code. Two typical patterns are handled: /// Pattern1: /// %2 = sext/zext %1 to /// %4 = sext/zext %3 to // or %4 = build_vector %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul %2, %4 /// /// Pattern2: /// %2 = zext/sext %1 to /// %4 = zext/sext %3 to /// or %4 = build_vector %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul %2, %4 /// /// There are four mul shrinking modes: /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is /// -128 to 128, and the scalar value range of %4 is also -128 to 128, /// generate pmullw+sext32 for it (MULS8 mode). /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is /// 0 to 255, and the scalar value range of %4 is also 0 to 255, /// generate pmullw+zext32 for it (MULU8 mode). /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, /// generate pmullw+pmulhw for it (MULS16 mode). /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, /// generate pmullw+pmulhuw for it (MULU16 mode). static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Check for legality // pmullw/pmulhw are not supported by SSE. if (!Subtarget.hasSSE2()) return SDValue(); // Check for profitability // pmulld is supported since SSE41. It is better to use pmulld // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than // the expansion. bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize(); if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(N, DAG, Mode)) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); if ((NumElts % 2) != 0) return SDValue(); unsigned RegSize = 128; MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); if (NumElts >= OpsVT.getVectorNumElements()) { // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); if (Mode == MULU8 || Mode == MULS8) { return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT, MulLo); } else { MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. // Generate shuffle functioning as punpcklwd. SmallVector ShuffleMask(NumElts); for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i; ShuffleMask[2 * i + 1] = i + NumElts; } SDValue ResLo = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResLo = DAG.getBitcast(ResVT, ResLo); // Generate shuffle functioning as punpckhwd. for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i + NumElts / 2; ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; } SDValue ResHi = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResHi = DAG.getBitcast(ResVT, ResHi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } } else { // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want // to legalize the mul explicitly because implicit legalization for type // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack // instructions which will not exist when we explicitly legalize it by // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with // <4 x i16> undef). // // Legalize the operands of mul. // FIXME: We may be able to handle non-concatenated vectors by insertion. unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); if ((RegSize % ReducedSizeInBits) != 0) return SDValue(); SmallVector Ops(RegSize / ReducedSizeInBits, DAG.getUNDEF(ReducedVT)); Ops[0] = NewN0; NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); Ops[0] = NewN1; NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); if (Mode == MULU8 || Mode == MULS8) { // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower // part is needed. SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); // convert the type of mul result to VT. MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG : ISD::SIGN_EXTEND_VECTOR_INREG, DL, ResVT, Mul); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } else { // Generate the lower and higher part of mul: pmulhw/pmulhuw. For // MULU16/MULS16, both parts are needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, OpsVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. Make sure the type of mul result is VT. MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); Res = DAG.getBitcast(ResVT, Res); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } } } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, SDLoc DL) { auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(Mult, DL, VT)); Result = DAG.getNode(ISD::SHL, DL, VT, Result, DAG.getConstant(Shift, DL, MVT::i8)); Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; auto combineMulMulAddOrSub = [&](bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(9, DL, VT)); Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT)); Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; switch (MulAmt) { default: break; case 11: // mul x, 11 => add ((shl (mul x, 5), 1), x) return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); case 21: // mul x, 21 => add ((shl (mul x, 5), 2), x) return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); case 22: // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); case 19: // mul x, 19 => sub ((shl (mul x, 5), 2), x) return combineMulShlAddOrSub(5, 2, /*isAdd*/ false); case 13: // mul x, 13 => add ((shl (mul x, 3), 2), x) return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); case 23: // mul x, 13 => sub ((shl (mul x, 3), 3), x) return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); case 14: // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulShlAddOrSub(3, 2, /*isAdd*/ true)); case 26: // mul x, 26 => sub ((mul (mul x, 9), 3), x) return combineMulMulAddOrSub(/*isAdd*/ false); case 28: // mul x, 28 => add ((mul (mul x, 9), 3), x) return combineMulMulAddOrSub(/*isAdd*/ true); case 29: // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulMulAddOrSub(/*isAdd*/ true)); case 30: // mul x, 30 => sub (sub ((shl x, 5), x), x) return DAG.getNode( ISD::SUB, DL, VT, DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(5, DL, MVT::i8)), N->getOperand(0)), N->getOperand(0)); } return SDValue(); } /// Optimize a single multiply with constant into two operations in order to /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); if (!MulConstantOptimization) return SDValue(); // An imul is usually smaller than the alternative sequence. if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); uint64_t MulAmt = C->getZExtValue(); if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) return SDValue(); uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; if ((MulAmt % 9) == 0) { MulAmt1 = 9; MulAmt2 = MulAmt / 9; } else if ((MulAmt % 5) == 0) { MulAmt1 = 5; MulAmt2 = MulAmt / 5; } else if ((MulAmt % 3) == 0) { MulAmt1 = 3; MulAmt2 = MulAmt / 3; } SDLoc DL(N); SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) // If second multiplifer is pow2, issue it first. We want the multiply by // 3, 5, or 9 to be folded into the addressing mode unless the lone use // is an add. std::swap(MulAmt1, MulAmt2); if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(MulAmt1, DL, VT)); if (isPowerOf2_64(MulAmt2)) NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); } else if (!Subtarget.slowLEA()) NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); if (!NewMul) { assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && "Both cases that could cause potential overflows should have " "already been handled."); int64_t SignMulAmt = C->getSExtValue(); if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) && (SignMulAmt != -INT64_MAX)) { int NumSign = SignMulAmt > 0 ? 1 : -1; bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1); bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1); if (IsPowerOf2_64PlusOne) { // (mul x, 2^N + 1) => (add (shl x, N), x) NewMul = DAG.getNode( ISD::ADD, DL, VT, N->getOperand(0), DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL, MVT::i8))); } else if (IsPowerOf2_64MinusOne) { // (mul x, 2^N - 1) => (sub (shl x, N), x) NewMul = DAG.getNode( ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL, MVT::i8)), N->getOperand(0)); } // To negate, subtract the number from zero if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); } } if (NewMul) // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, NewMul, false); return SDValue(); } static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask <<= N1C->getAPIntValue(); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if // we carefully interrogate the mask to make sure we are semantics // preserving. // The transform is not safe if the result of C1 << C2 exceeds the bitwidth // of the underlying setcc_c operation if the setcc_c was zero extended. // Consider the following example: // zext(setcc_c) -> i32 0x0000FFFF // c1 -> i32 0x0000FFFF // c2 -> i32 0x00000001 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE if (N00.getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if (N00.getOpcode() == ISD::SIGN_EXTEND && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || N00.getOpcode() == ISD::ANY_EXTEND) && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); } if (MaskOK && Mask != 0) { SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } // Hardware support for vector shifts is sparse which makes us scalarize the // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V if (auto *N1BV = dyn_cast(N1)) if (auto *N1SplatC = N1BV->getConstantSplatNode()) { assert(N0.getValueType().isVector() && "Invalid vector shift type"); // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. if (N1SplatC->getAPIntValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } return SDValue(); } static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned Size = VT.getSizeInBits(); // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) // depending on sign of (SarConst - [56,48,32,24,16]) // sexts in X86 are MOVs. The MOVs have the same code size // as above SHIFTs (only SHIFT on 1 has lower code size). // However the MOVs have 2 advantages to a SHIFT: // 1. MOVs can write to a register that differs from source // 2. MOVs accept memory operands if (VT.isVector() || N1.getOpcode() != ISD::Constant || N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || N0.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); APInt ShlConst = (cast(N01))->getAPIntValue(); APInt SarConst = (cast(N1))->getAPIntValue(); EVT CVT = N1.getValueType(); if (SarConst.isNegative()) return SDValue(); for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); // skipping types without corresponding sext/zext and // ShlConst that is not one of [56,48,32,24,16] if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); SarConst = SarConst - (Size - ShiftSize); if (SarConst == 0) return NN; else if (SarConst.isNegative()) return DAG.getNode(ISD::SHL, DL, VT, NN, DAG.getConstant(-SarConst, DL, CVT)); else return DAG.getNode(ISD::SRA, DL, VT, NN, DAG.getConstant(SarConst, DL, CVT)); } return SDValue(); } static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // Try to improve a sequence of srl (and X, C1), C2 by inverting the order. // TODO: This is a generic DAG combine that became an x86-only combine to // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and // and-not ('andn'). if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) return SDValue(); auto *ShiftC = dyn_cast(N1); auto *AndC = dyn_cast(N0.getOperand(1)); if (!ShiftC || !AndC) return SDValue(); // If we can shrink the constant mask below 8-bits or 32-bits, then this // transform should reduce code size. It may also enable secondary transforms // from improved known-bits analysis or instruction selection. APInt MaskVal = AndC->getAPIntValue(); APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue()); unsigned OldMaskSize = MaskVal.getMinSignedBits(); unsigned NewMaskSize = NewMaskVal.getMinSignedBits(); if ((OldMaskSize > 8 && NewMaskSize <= 8) || (OldMaskSize > 32 && NewMaskSize <= 32)) { // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC) SDLoc DL(N); SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT); SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1); return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask); } return SDValue(); } static SDValue combineShift(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (N->getOpcode() == ISD::SHL) if (SDValue V = combineShiftLeft(N, DAG)) return V; if (N->getOpcode() == ISD::SRA) if (SDValue V = combineShiftRightArithmetic(N, DAG)) return V; if (N->getOpcode() == ISD::SRL) if (SDValue V = combineShiftRightLogical(N, DAG)) return V; return SDValue(); } static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned DstBitsPerElt = VT.getScalarSizeInBits(); unsigned SrcBitsPerElt = 2 * DstBitsPerElt; assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && N1.getScalarValueSizeInBits() == SrcBitsPerElt && "Unexpected PACKSS/PACKUS input type"); // Constant Folding. APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) && (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) && getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumDstElts = VT.getVectorNumElements(); unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; bool IsSigned = (X86ISD::PACKSS == Opcode); APInt Undefs(NumDstElts, 0); SmallVector Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt)); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0); auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0); if (UndefElts[SrcIdx]) { Undefs.setBit(Lane * NumDstEltsPerLane + Elt); continue; } APInt &Val = EltBits[SrcIdx]; if (IsSigned) { // PACKSS: Truncate signed value with signed saturation. // Source values less than dst minint are saturated to minint. // Source values greater than dst maxint are saturated to maxint. if (Val.isSignedIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getSignedMinValue(DstBitsPerElt); else Val = APInt::getSignedMaxValue(DstBitsPerElt); } else { // PACKUS: Truncate signed value with unsigned saturation. // Source values less than zero are saturated to zero. // Source values greater than dst maxuint are saturated to maxuint. if (Val.isIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getNullValue(DstBitsPerElt); else Val = APInt::getAllOnesValue(DstBitsPerElt); } Bits[Lane * NumDstEltsPerLane + Elt] = Val; } } return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } // Attempt to combine as shuffle. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } return SDValue(); } static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || X86ISD::VSRLI == Opcode) && "Unexpected shift opcode"); bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && "Unexpected value type"); // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. APInt ShiftVal = cast(N1)->getAPIntValue(); if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) { if (LogicalShift) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); else ShiftVal = NumBitsPerElt - 1; } // Shift N0 by zero -> N0. if (!ShiftVal) return N0; // Shift zero -> zero. if (ISD::isBuildVectorAllZeros(N0.getNode())) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31). // This VSRLI only looks at the sign bit, which is unmodified by VSRAI. // TODO - support other sra opcodes as needed. if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt && N0.getOpcode() == X86ISD::VSRAI) return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1); // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI && N1 == N0.getOperand(1)) { SDValue N00 = N0.getOperand(0); unsigned NumSignBits = DAG.ComputeNumSignBits(N00); if (ShiftVal.ult(NumSignBits)) return N00; } // We can decode 'whole byte' logical bit shifts as shuffles. if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } } // Constant Folding. APInt UndefElts; SmallVector EltBits; if (N->isOnlyUserOf(N0.getNode()) && getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { assert(EltBits.size() == VT.getVectorNumElements() && "Unexpected shift value type"); unsigned ShiftImm = ShiftVal.getZExtValue(); for (APInt &Elt : EltBits) { if (X86ISD::VSHLI == Opcode) Elt <<= ShiftImm; else if (X86ISD::VSRAI == Opcode) Elt.ashrInPlace(ShiftImm); else Elt.lshrInPlace(ShiftImm); } return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); } return SDValue(); } static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert( ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) || (N->getOpcode() == X86ISD::PINSRW && N->getValueType(0) == MVT::v8i16)) && "Unexpected vector insertion"); // Attempt to combine PINSRB/PINSRW patterns to a shuffle. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } return SDValue(); } /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for /// OR -> CMPNEQSS. static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned opcode; // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); SDValue CMP1 = N1->getOperand(1); SDLoc DL(N); // The SETCCs should both refer to the same CMP. if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) return SDValue(); SDValue CMP00 = CMP0->getOperand(0); SDValue CMP01 = CMP0->getOperand(1); EVT VT = CMP00.getValueType(); if (VT == MVT::f32 || VT == MVT::f64) { bool ExpectingFlags = false; // Check for any users that want flags: for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); !ExpectingFlags && UI != UE; ++UI) switch (UI->getOpcode()) { default: case ISD::BR_CC: case ISD::BRCOND: case ISD::SELECT: ExpectingFlags = true; break; case ISD::CopyToReg: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: break; } if (!ExpectingFlags) { enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { X86::CondCode tmp = cc0; cc0 = cc1; cc1 = tmp; } if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { // FIXME: need symbolic constants for these magic numbers. // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; if (Subtarget.hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getSimpleValueType(0), FSetCC, DAG.getIntPtrConstant(0, DL)); } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; if (is64BitFP && !Subtarget.is64Bit()) { // On a 32-bit target, we cannot bitcast the 64-bit float to a // 64-bit integer, since that's not a legal type. Since // OnesOrZeroesF is all ones of all zeroes, we don't need all the // bits, but can do this little dance to extract the lowest 32 bits // and work with those going forward. SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, OnesOrZeroesF); SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32, DAG.getIntPtrConstant(0, DL)); IntVT = MVT::i32; } SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, DAG.getConstant(1, DL, IntVT)); SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); return OneBitOfTruth; } } } } return SDValue(); } /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64) return SDValue(); if (N0.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); if (N1.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); return SDValue(); } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. This method optimizes // some of the transition sequences. // Even with AVX-512 this is still useful for removing casts around logical // operations on vXi1 mask types. static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); assert(VT.isVector() && "Expected vector type"); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow.getValueType(); if (Narrow->getOpcode() != ISD::XOR && Narrow->getOpcode() != ISD::AND && Narrow->getOpcode() != ISD::OR) return SDValue(); SDValue N0 = Narrow->getOperand(0); SDValue N1 = Narrow->getOperand(1); SDLoc DL(Narrow); // The Left side has to be a trunc. if (N0.getOpcode() != ISD::TRUNCATE) return SDValue(); // The type of the truncated inputs. if (N0->getOperand(0).getValueType() != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getValueType() == VT; - ConstantSDNode *RHSConstSplat = nullptr; - if (auto *RHSBV = dyn_cast(N1)) - RHSConstSplat = RHSBV->getConstantSplatNode(); - if (!RHSTrunc && !RHSConstSplat) + if (!RHSTrunc && + !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT)) return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); - if (RHSConstSplat) { - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT.getVectorElementType(), - SDValue(RHSConstSplat, 0)); - N1 = DAG.getSplatBuildVector(VT, DL, N1); - } else if (RHSTrunc) { + if (RHSTrunc) N1 = N1->getOperand(0); - } + else + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); // Generate the wide operation. SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1); unsigned Opcode = N->getOpcode(); switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType()); case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); } } /// If both input operands of a logic op are being cast from floating point /// types, try to convert this into a floating point logic node to avoid /// unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned FPOpcode = ISD::DELETED_NODE; if (N->getOpcode() == ISD::AND) FPOpcode = X86ISD::FAND; else if (N->getOpcode() == ISD::OR) FPOpcode = X86ISD::FOR; else if (N->getOpcode() == ISD::XOR) FPOpcode = X86ISD::FXOR; assert(FPOpcode != ISD::DELETED_NODE && "Unexpected input node for FP logic conversion"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && ((Subtarget.hasSSE1() && VT == MVT::i32) || (Subtarget.hasSSE2() && VT == MVT::i64))) { SDValue N00 = N0.getOperand(0); SDValue N10 = N1.getOperand(0); EVT N00Type = N00.getValueType(); EVT N10Type = N10.getValueType(); if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); return DAG.getBitcast(VT, FPLogic); } } return SDValue(); } /// If this is a zero/all-bits result that is bitwise-anded with a low bits /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' /// with a shift-right to eliminate loading the vector constant mask value. static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); EVT VT0 = Op0.getValueType(); EVT VT1 = Op1.getValueType(); if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger()) return SDValue(); APInt SplatVal; if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || !SplatVal.isMask()) return SDValue(); if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)) return SDValue(); unsigned EltBitWidth = VT0.getScalarSizeInBits(); if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) return SDValue(); SDLoc DL(N); unsigned ShiftVal = SplatVal.countTrailingOnes(); SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } // Get the index node from the lowered DAG of a GEP IR instruction with one // indexing dimension. static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) { if (Ld->isIndexed()) return SDValue(); SDValue Base = Ld->getBasePtr(); if (Base.getOpcode() != ISD::ADD) return SDValue(); SDValue ShiftedIndex = Base.getOperand(0); if (ShiftedIndex.getOpcode() != ISD::SHL) return SDValue(); return ShiftedIndex.getOperand(0); } static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) { if (Subtarget.hasBMI2() && VT.isScalarInteger()) { switch (VT.getSizeInBits()) { default: return false; case 64: return Subtarget.is64Bit() ? true : false; case 32: return true; } } return false; } // This function recognizes cases where X86 bzhi instruction can replace and // 'and-load' sequence. // In case of loading integer value from an array of constants which is defined // as follows: // // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1} // // then applying a bitwise and on the result with another input. // It's equivalent to performing bzhi (zero high bits) on the input, with the // same index of the load. static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Node->getSimpleValueType(0); SDLoc dl(Node); // Check if subtarget has BZHI instruction for the node's type if (!hasBZHI(Subtarget, VT)) return SDValue(); // Try matching the pattern for both operands. for (unsigned i = 0; i < 2; i++) { SDValue N = Node->getOperand(i); LoadSDNode *Ld = dyn_cast(N.getNode()); // continue if the operand is not a load instruction if (!Ld) return SDValue(); const Value *MemOp = Ld->getMemOperand()->getValue(); if (!MemOp) return SDValue(); if (const GetElementPtrInst *GEP = dyn_cast(MemOp)) { if (GlobalVariable *GV = dyn_cast(GEP->getOperand(0))) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) { Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); if (!isa(Init) || !Ty->getArrayElementType()->isIntegerTy() || Ty->getArrayElementType()->getScalarSizeInBits() != VT.getSizeInBits() || Ty->getArrayNumElements() > Ty->getArrayElementType()->getScalarSizeInBits()) continue; // Check if the array's constant elements are suitable to our case. uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); bool ConstantsMatch = true; for (uint64_t j = 0; j < ArrayElementCount; j++) { ConstantInt *Elem = dyn_cast(Init->getAggregateElement(j)); if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) { ConstantsMatch = false; break; } } if (!ConstantsMatch) continue; // Do the transformation (For 32-bit type): // -> (and (load arr[idx]), inp) // <- (and (srl 0xFFFFFFFF, (sub 32, idx))) // that will be replaced with one bzhi instruction. SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0); SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT); // Get the Node which indexes into the array. SDValue Index = getIndexFromUnindexedLoad(Ld); if (!Index) return SDValue(); Index = DAG.getZExtOrTrunc(Index, dl, VT); SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index); SDValue AllOnes = DAG.getAllOnesConstant(dl, VT); SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub); return DAG.getNode(ISD::AND, dl, VT, Inp, LShr); } } } } return SDValue(); } static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); // If this is SSE1 only convert to FAND to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast( MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N->getOperand(0)), DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); } if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) return R; if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; // Attempt to recursively combine a bitmask AND with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } } // Attempt to combine a scalar bitmask AND with an extracted shuffle. if ((VT.getScalarSizeInBits() % 8) == 0 && N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa(N->getOperand(0).getOperand(1))) { SDValue BitMask = N->getOperand(1); SDValue SrcVec = N->getOperand(0).getOperand(0); EVT SrcVecVT = SrcVec.getValueType(); // Check that the constant bitmask masks whole bytes. APInt UndefElts; SmallVector EltBits; if (VT == SrcVecVT.getScalarType() && N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) && getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && llvm::all_of(EltBits, [](APInt M) { return M.isNullValue() || M.isAllOnesValue(); })) { unsigned NumElts = SrcVecVT.getVectorNumElements(); unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8; unsigned Idx = N->getOperand(0).getConstantOperandVal(1); // Create a root shuffle mask from the byte mask and the extracted index. SmallVector ShuffleMask(NumElts * Scale, SM_SentinelUndef); for (unsigned i = 0; i != Scale; ++i) { if (UndefElts[i]) continue; int VecIdx = Scale * Idx + i; ShuffleMask[VecIdx] = EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx; } if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); } } return SDValue(); } // Try to fold: // (or (and (m, y), (pandn m, x))) // into: // (vselect m, x, y) // As a special case, try to fold: // (or (and (m, (sub 0, x)), (pandn m, x))) // into: // (sub (xor X, M), M) static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || (VT.is256BitVector() && Subtarget.hasInt256()))) return SDValue(); // Canonicalize AND to LHS. if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for // ANDNP combine allows other combines to happen that prevent matching. if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) return SDValue(); SDValue Mask = N1.getOperand(0); SDValue X = N1.getOperand(1); SDValue Y; if (N0.getOperand(0) == Mask) Y = N0.getOperand(1); if (N0.getOperand(1) == Mask) Y = N0.getOperand(0); // Check to see if the mask appeared in both the AND and ANDNP. if (!Y.getNode()) return SDValue(); // Validate that X, Y, and Mask are bitcasts, and see through them. Mask = peekThroughBitcasts(Mask); X = peekThroughBitcasts(X); Y = peekThroughBitcasts(Y); EVT MaskVT = Mask.getValueType(); unsigned EltBits = MaskVT.getScalarSizeInBits(); // TODO: Attempt to handle floating point cases as well? if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); // Try to match: // (or (and (M, (sub 0, X)), (pandn M, X))) // which is a special case of vselect: // (vselect M, (sub 0, X), X) // Per: // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate // We know that, if fNegate is 0 or 1: // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) // // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) // ( M ? -X : X) == ((X ^ M ) + (M & 1)) // This lets us transform our vselect to: // (add (xor X, M), (and M, 1)) // And further to: // (sub (xor X, M), M) if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { auto IsNegV = [](SDNode *N, SDValue V) { return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); }; SDValue V; if (IsNegV(Y.getNode(), X)) V = X; else if (IsNegV(X.getNode(), Y)) V = Y; if (V) { SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); SDValue SubOp2 = Mask; // If the negate was on the false side of the select, then // the operands of the SUB need to be swapped. PR 27251. // This is because the pattern being matched above is // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) // but if the pattern matched was // (vselect M, X, (sub (0, X))), that is really negation of the pattern // above, -(vselect M, (sub 0, X), X), and therefore the replacement // pattern also needs to be a negation of the replacement pattern above. // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the // sub accomplishes the negation of the replacement pattern. if (V == Y) std::swap(SubOp1, SubOp2); SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); return DAG.getBitcast(VT, Res); } } // PBLENDVB is only available on SSE 4.1. if (!Subtarget.hasSSE41()) return SDValue(); MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } // Helper function for combineOrCmpEqZeroToCtlzSrl // Transforms: // seteq(cmp x, 0) // into: // srl(ctlz x), log2(bitsize(x)) // Input pattern is checked by caller. static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, SelectionDAG &DAG) { SDValue Cmp = Op.getOperand(1); EVT VT = Cmp.getOperand(0).getValueType(); unsigned Log2b = Log2_32(VT.getSizeInBits()); SDLoc dl(Op); SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0)); // The result of the shift is true or false, and on X86, the 32-bit // encoding of shr and lzcnt is more desirable. SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, DAG.getConstant(Log2b, dl, VT)); return DAG.getZExtOrTrunc(Scc, dl, ExtTy); } // Try to transform: // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) // into: // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)) // Will also attempt to match more generic cases, eg: // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0))) // Only applies if the target supports the FastLZCNT feature. static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast()) return SDValue(); auto isORCandidate = [](SDValue N) { return (N->getOpcode() == ISD::OR && N->hasOneUse()); }; // Check the zero extend is extending to 32-bit or more. The code generated by // srl(ctlz) for 16-bit or less variants of the pattern would require extra // instructions to clear the upper bits. if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) || !isORCandidate(N->getOperand(0))) return SDValue(); // Check the node matches: setcc(eq, cmp 0) auto isSetCCCandidate = [](SDValue N) { return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && N->getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(N->getOperand(1).getOperand(1)) && N->getOperand(1).getValueType().bitsGE(MVT::i32); }; SDNode *OR = N->getOperand(0).getNode(); SDValue LHS = OR->getOperand(0); SDValue RHS = OR->getOperand(1); // Save nodes matching or(or, setcc(eq, cmp 0)). SmallVector ORNodes; while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) || (isORCandidate(RHS) && isSetCCCandidate(LHS)))) { ORNodes.push_back(OR); OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode(); LHS = OR->getOperand(0); RHS = OR->getOperand(1); } // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)). if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) || !isORCandidate(SDValue(OR, 0))) return SDValue(); // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it // to // or(srl(ctlz),srl(ctlz)). // The dag combiner can then fold it into: // srl(or(ctlz, ctlz)). EVT VT = OR->getValueType(0); SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG); SDValue Ret, NewRHS; if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG))) Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS); if (!Ret) return SDValue(); // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern. while (ORNodes.size() > 0) { OR = ORNodes.pop_back_val(); LHS = OR->getOperand(0); RHS = OR->getOperand(1); // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); EVT VT = OR->getValueType(0); SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); if (!NewRHS) return SDValue(); Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); } if (Ret) Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); return Ret; } static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // If this is SSE1 only convert to FOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent // series of shifts/or that would otherwise be generated. // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions // have higher latencies and we are not optimizing for size. if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); if (!N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); SDValue ShAmt0 = N0.getOperand(1); if (ShAmt0.getValueType() != MVT::i8) return SDValue(); SDValue ShAmt1 = N1.getOperand(1); if (ShAmt1.getValueType() != MVT::i8) return SDValue(); if (ShAmt0.getOpcode() == ISD::TRUNCATE) ShAmt0 = ShAmt0.getOperand(0); if (ShAmt1.getOpcode() == ISD::TRUNCATE) ShAmt1 = ShAmt1.getOperand(0); SDLoc DL(N); unsigned Opc = X86ISD::SHLD; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) { Opc = X86ISD::SHRD; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); } // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C ) // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C ) // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C ) // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C ) unsigned Bits = VT.getSizeInBits(); if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (ConstantSDNode *SumC = dyn_cast(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) return DAG.getNode(Opc, DL, VT, Op0, Op1, DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } } else if (ConstantSDNode *ShAmt1C = dyn_cast(ShAmt1)) { ConstantSDNode *ShAmt0C = dyn_cast(ShAmt0); if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), N1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } else if (ShAmt1.getOpcode() == ISD::XOR) { SDValue Mask = ShAmt1.getOperand(1); if (ConstantSDNode *MaskC = dyn_cast(Mask)) { unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL); SDValue ShAmt1Op0 = ShAmt1.getOperand(0); if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) ShAmt1Op0 = ShAmt1Op0.getOperand(0); if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) { if (Op1.getOpcode() == InnerShift && isa(Op1.getOperand(1)) && Op1.getConstantOperandVal(1) == 1) { return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && Op1.getOperand(0) == Op1.getOperand(1)) { return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } } } } return SDValue(); } /// Try to turn tests against the signbit in the form of: /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) /// into: /// SETGT(X, -1) static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { // This is only worth doing if the output type is i8 or i1. EVT ResultType = N->getValueType(0); if (ResultType != MVT::i8 && ResultType != MVT::i1) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // We should be performing an xor against a truncated shift. if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) return SDValue(); // Make sure we are performing an xor against one. if (!isOneConstant(N1)) return SDValue(); // SetCC on x86 zero extends so only act on this if it's a logical shift. SDValue Shift = N0.getOperand(0); if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) return SDValue(); // Make sure we are truncating from one of i16, i32 or i64. EVT ShiftTy = Shift.getValueType(); if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) return SDValue(); // Make sure the shift amount extracts the sign bit. if (!isa(Shift.getOperand(1)) || Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) return SDValue(); // Create a greater-than comparison against -1. // N.B. Using SETGE against 0 works but we want a canonical looking // comparison, using SETGT matches up with what TranslateX86CC. SDLoc DL(N); SDValue ShiftOp = Shift.getOperand(0); EVT ShiftOpTy = ShiftOp.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResultType); SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp, DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); if (SetCCResultType != ResultType) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond); return Cond; } /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: /// pcmpgt X, -1 /// /// This should be called before type legalization because the pattern may not /// persist after that. static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (!VT.isSimple()) return SDValue(); switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break; case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break; } // There must be a shift right algebraic before the xor, and the xor must be a // 'not' operation. SDValue Shift = N->getOperand(0); SDValue Ones = N->getOperand(1); if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() || !ISD::isBuildVectorAllOnes(Ones.getNode())) return SDValue(); // The shift should be smearing the sign bit across each vector element. auto *ShiftBV = dyn_cast(Shift.getOperand(1)); if (!ShiftBV) return SDValue(); EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); auto *ShiftAmt = ShiftBV->getConstantSplatNode(); if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) return SDValue(); // Create a greater-than comparison against -1. We don't use the more obvious // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } /// Check if truncation with saturation form type \p SrcVT to \p DstVT /// is valid for the given \p Subtarget. static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX512()) return false; // FIXME: Scalar type may be supported if we move it to vector register. if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512) return false; EVT SrcElVT = SrcVT.getScalarType(); EVT DstElVT = DstVT.getScalarType(); if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) return false; if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32) return false; if (SrcVT.is512BitVector() || Subtarget.hasVLX()) return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); return false; } /// Detect a pattern of truncation with saturation: /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// Return the source value to be truncated or SDValue() if the pattern was not /// matched. static SDValue detectUSatPattern(SDValue In, EVT VT) { if (In.getOpcode() != ISD::UMIN) return SDValue(); //Saturation with truncation. We truncate from InVT to VT. assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && "Unexpected types for truncate operation"); APInt C; if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) { // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according // the element size of the destination type. return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue(); } return SDValue(); } /// Detect a pattern of truncation with saturation: /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// The types should allow to use VPMOVUS* instruction on AVX512. /// Return the source value to be truncated or SDValue() if the pattern was not /// matched. static SDValue detectAVX512USatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) { if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) return SDValue(); return detectUSatPattern(In, VT); } static SDValue combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT)) return SDValue(); if (auto USatVal = detectUSatPattern(In, VT)) if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); return SDValue(); } /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { if (!VT.isVector() || !VT.isSimple()) return SDValue(); EVT InVT = In.getValueType(); unsigned NumElems = VT.getVectorNumElements(); EVT ScalarVT = VT.getVectorElementType(); if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && isPowerOf2_32(NumElems))) return SDValue(); // InScalarVT is the intermediate type in AVG pattern and it should be greater // than the original input type (i8/i16). EVT InScalarVT = InVT.getVectorElementType(); if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); // Detect the following pattern: // // %1 = zext %a to // %2 = zext %b to // %3 = add nuw nsw %1, // %4 = add nuw nsw %3, %2 // %5 = lshr %N, // %6 = trunc %5 to // // In AVX512, the last instruction can also be a trunc store. if (In.getOpcode() != ISD::SRL) return SDValue(); // A lambda checking the given SDValue is a constant vector and each element // is in the range [Min, Max]. auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { BuildVectorSDNode *BV = dyn_cast(V); if (!BV || !BV->isConstant()) return false; for (SDValue Op : V->ops()) { ConstantSDNode *C = dyn_cast(Op); if (!C) return false; uint64_t Val = C->getZExtValue(); if (Val < Min || Val > Max) return false; } return true; }; // Split vectors to legal target size and apply AVG. auto LowerToAVG = [&](SDValue Op0, SDValue Op1) { unsigned NumSubs = 1; if (Subtarget.hasBWI()) { if (VT.getSizeInBits() > 512) NumSubs = VT.getSizeInBits() / 512; } else if (Subtarget.hasAVX2()) { if (VT.getSizeInBits() > 256) NumSubs = VT.getSizeInBits() / 256; } else { if (VT.getSizeInBits() > 128) NumSubs = VT.getSizeInBits() / 128; } if (NumSubs == 1) return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1); SmallVector Subs; EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), VT.getVectorNumElements() / NumSubs); for (unsigned i = 0; i != NumSubs; ++i) { unsigned Idx = i * SubVT.getVectorNumElements(); SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits()); SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits()); Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); }; // Check if each element of the vector is left-shifted by one. auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); if (!IsConstVectorInRange(RHS, 1, 1)) return SDValue(); if (LHS.getOpcode() != ISD::ADD) return SDValue(); // Detect a pattern of a + b + 1 where the order doesn't matter. SDValue Operands[3]; Operands[0] = LHS.getOperand(0); Operands[1] = LHS.getOperand(1); // Take care of the case when one of the operands is a constant vector whose // element is in the range [1, 256]. if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && Operands[0].getOpcode() == ISD::ZERO_EXTEND && Operands[0].getOperand(0).getValueType() == VT) { // The pattern is detected. Subtract one from the constant vector, then // demote it and emit X86ISD::AVG instruction. SDValue VecOnes = DAG.getConstant(1, DL, InVT); Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); return LowerToAVG(Operands[0].getOperand(0), Operands[1]); } if (Operands[0].getOpcode() == ISD::ADD) std::swap(Operands[0], Operands[1]); else if (Operands[1].getOpcode() != ISD::ADD) return SDValue(); Operands[2] = Operands[1].getOperand(0); Operands[1] = Operands[1].getOperand(1); // Now we have three operands of two additions. Check that one of them is a // constant vector with ones, and the other two are promoted from i8/i16. for (int i = 0; i < 3; ++i) { if (!IsConstVectorInRange(Operands[i], 1, 1)) continue; std::swap(Operands[i], Operands[2]); // Check if Operands[0] and Operands[1] are results of type promotion. for (int j = 0; j < 2; ++j) if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || Operands[j].getOperand(0).getValueType() != VT) return SDValue(); // The pattern is detected, emit X86ISD::AVG instruction. return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0)); } return SDValue(); } static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { LoadSDNode *Ld = cast(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. Also split non-temporal aligned loads on // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, AddressSpace, Alignment, &Fast) && !Fast))) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Ptr = Ld->getBasePtr(); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Alignment, Ld->getMemOperand()->getFlags()); Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl); SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), std::min(16U, Alignment), Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getUNDEF(RegVT); NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl); NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl); return DCI.CombineTo(N, NewVec, TF, true); } return SDValue(); } /// If V is a build vector of boolean constants and exactly one of those /// constants is true, return the operand index of that true element. /// Otherwise, return -1. static int getOneTrueElt(SDValue V) { // This needs to be a build vector of booleans. // TODO: Checking for the i1 type matches the IR definition for the mask, // but the mask check could be loosened to i8 or other types. That might // also require checking more than 'allOnesValue'; eg, the x86 HW // instructions only require that the MSB is set for each mask element. // The ISD::MSTORE comments/definition do not specify how the mask operand // is formatted. auto *BV = dyn_cast(V); if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1) return -1; int TrueIndex = -1; unsigned NumElts = BV->getValueType(0).getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { const SDValue &Op = BV->getOperand(i); if (Op.isUndef()) continue; auto *ConstNode = dyn_cast(Op); if (!ConstNode) return -1; if (ConstNode->getAPIntValue().isAllOnesValue()) { // If we already found a one, this is too many. if (TrueIndex >= 0) return -1; TrueIndex = i; } } return TrueIndex; } /// Given a masked memory load/store operation, return true if it has one mask /// bit set. If it has one mask bit set, then also return the memory address of /// the scalar element to load/store, the vector index to insert/extract that /// scalar element, and the alignment for the scalar memory access. static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, unsigned &Alignment) { int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); if (TrueMaskElt < 0) return false; // Get the address of the one scalar element that is specified by the mask // using the appropriate offset from the base pointer. EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); Addr = MaskedOp->getBasePtr(); if (TrueMaskElt != 0) { unsigned Offset = TrueMaskElt * EltVT.getStoreSize(); Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp)); } Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize()); return true; } /// If exactly one element of the mask is set for a non-extending masked load, /// it is a scalar load and vector insert. /// Note: It is expected that the degenerate cases of an all-zeros or all-ones /// mask have already been optimized in IR, so we don't bother with those here. static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; unsigned Alignment; if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment)) return SDValue(); // Load the one scalar element that is specified by the mask using the // appropriate offset from the base pointer. SDLoc DL(ML); EVT VT = ML->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(), Alignment, ML->getMemOperand()->getFlags()); // Insert the loaded element into the appropriate place in the vector. SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(), Load, VecIndex); return DCI.CombineTo(ML, Insert, Load.getValue(1), true); } static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) return SDValue(); SDLoc DL(ML); EVT VT = ML->getValueType(0); // If we are loading the first and last elements of a vector, it is safe and // always faster to load the whole vector. Replace the masked load with a // vector load and select. unsigned NumElts = VT.getVectorNumElements(); BuildVectorSDNode *MaskBV = cast(ML->getMask()); bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0)); bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1)); if (LoadFirstElt && LoadLastElt) { SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(), ML->getMemOperand()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0()); return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true); } // Convert a masked load with a constant mask into a masked load and a select. // This allows the select operation to use a faster kind of select instruction // (for example, vblendvps -> vblendps). // Don't try this if the pass-through operand is already undefined. That would // cause an infinite loop because that's what we're about to create. if (ML->getSrc0().isUndef()) return SDValue(); // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), ML->getMask(), DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(), ML->getExtensionType()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0()); return DCI.CombineTo(ML, Blend, NewML.getValue(1), true); } static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MaskedLoadSDNode *Mld = cast(N); // TODO: Expanding load with constant mask may be optimized as well. if (Mld->isExpandingLoad()) return SDValue(); if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) return ScalarLoad; // TODO: Do some AVX512 subsets benefit from this transform? if (!Subtarget.hasAVX512()) if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) return Blend; } if (Mld->getExtensionType() != ISD::SEXTLOAD) return SDValue(); // Resolve extending loads. EVT VT = Mld->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); EVT LdVT = Mld->getMemoryVT(); SDLoc dl(Mld); assert(LdVT != VT && "Cannot extend to the same type"); unsigned ToSz = VT.getScalarSizeInBits(); unsigned FromSz = LdVT.getScalarSizeInBits(); // From/To sizes and ElemCount must be pow of two. assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for extending masked load"); unsigned SizeRatio = ToSz / FromSz; assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), LdVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); // Convert Src0 value. SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); if (!Mld->getSrc0().isUndef()) { SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, DAG.getUNDEF(WideVecVT), ShuffleVec); } // Prepare the new mask. SDValue NewMask; SDValue Mask = Mld->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type. NewMask = DAG.getBitcast(WideVecVT, Mask); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), ShuffleVec); } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); SmallVector Ops(NumConcat, ZeroVal); Ops[0] = Mask; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), Mld->getBasePtr(), NewMask, WideSrc0, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, /// it is a vector extract and scalar store. /// Note: It is expected that the degenerate cases of an all-zeros or all-ones /// mask have already been optimized in IR, so we don't bother with those here. static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG) { // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; unsigned Alignment; if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment)) return SDValue(); // Extract the one scalar element that is actually being stored. SDLoc DL(MS); EVT VT = MS->getValue().getValueType(); EVT EltVT = VT.getVectorElementType(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, MS->getValue(), VecIndex); // Store that element at the appropriate offset from the base pointer. return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(), Alignment, MS->getMemOperand()->getFlags()); } static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MaskedStoreSDNode *Mst = cast(N); if (Mst->isCompressingStore()) return SDValue(); if (!Mst->isTruncatingStore()) { if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) return ScalarStore; // If the mask is checking (0 > X), we're creating a vector with all-zeros // or all-ones elements based on the sign bits of X. AVX1 masked store only // cares about the sign bit of each mask element, so eliminate the compare: // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X // Note that by waiting to match an x86-specific PCMPGT node, we're // eliminating potentially more complex matching of a setcc node which has // a full range of predicates. SDValue Mask = Mst->getMask(); if (Mask.getOpcode() == X86ISD::PCMPGT && ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) { assert(Mask.getValueType() == Mask.getOperand(1).getValueType() && "Unexpected type for PCMPGT"); return DAG.getMaskedStore( Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(), Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand()); } // TODO: AVX512 targets should also be able to simplify something like the // pattern above, but that pattern will be different. It will either need to // match setcc more generally or match PCMPGTM later (in tablegen?). return SDValue(); } // Resolve truncating stores. EVT VT = Mst->getValue().getValueType(); unsigned NumElems = VT.getVectorNumElements(); EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. if (TLI.isTruncStoreLegal(VT, StVT)) return SDValue(); // From/To sizes and ElemCount must be pow of two. assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. assert (((NumElems * FromSz) % ToSz) == 0 && "Unexpected ratio for truncating masked store"); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), ShuffleVec); SDValue NewMask; SDValue Mask = Mst->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type. NewMask = DAG.getBitcast(WideVecVT, Mask); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) ShuffleVec[i] = NumElems*SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), ShuffleVec); } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); SmallVector Ops(NumConcat, ZeroVal); Ops[0] = Mask; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), NewMask, StVT, Mst->getMemOperand(), false); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { StoreSDNode *St = cast(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. bool Fast; unsigned AddressSpace = St->getAddressSpace(); unsigned Alignment = St->getAlignment(); if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl); SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), Alignment, St->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), std::min(16U, Alignment), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { // Check if we can detect an AVG pattern from the truncation. If yes, // replace the trunc store by a normal store with the result of X86ISD::AVG // instruction. if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl)) return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), dl, Val, St->getBasePtr(), St->getMemoryVT(), St->getMemOperand(), DAG); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) return SDValue(); // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. if (0 != (NumElems * FromSz) % ToSz) return SDValue(); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), ShuffleVec); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. // Find the largest store unit MVT StoreType = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && (64 <= NumElems * ToSz)) StoreType = MVT::f64; // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); SmallVector Chains; SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i, dl)); SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); Chains.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right // places to insert EMMS. This qualifies as a quick hack. // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. if (VT.getSizeInBits() != 64) return SDValue(); const Function &F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa(St->getValue()) && !cast(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { LoadSDNode *Ld = cast(St->getValue().getNode()); SmallVector Ops; if (!ISD::isNormalLoad(Ld)) return SDValue(); // If this is not the MMX case, i.e. we are just turning i64 load/store // into f64 load/store, avoid the transformation if there are multiple // uses of the loaded value. if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) return SDValue(); SDLoc LdDL(Ld); SDLoc StDL(N); // If we are a 64-bit capable x86, lower to a single movq load/store pair. // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget.is64Bit() || F64IsLegal) { MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Make sure new load is placed in same chain order. DAG.makeEquivalentMemoryOrdering(Ld, NewLd); return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(), St->getMemOperand()); } // Otherwise, lower to two pairs of 32-bit loads / stores. SDValue LoAddr = Ld->getBasePtr(); SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL); SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), MinAlign(Ld->getAlignment(), 4), Ld->getMemOperand()->getFlags()); // Make sure new loads are placed in same chain order. DAG.makeEquivalentMemoryOrdering(Ld, LoLd); DAG.makeEquivalentMemoryOrdering(Ld, HiLd); LoAddr = St->getBasePtr(); HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); SDValue LoSt = DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4), MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } // This is similar to the above case, but here we handle a scalar 64-bit // integer store that is extracted from a vector on a 32-bit target. // If we have SSE2, then we can treat it like a floating-point double // to get past legalization. The execution dependencies fixup pass will // choose the optimal machine instruction for the store if this really is // an integer or v2f32 rather than an f64. if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() && St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue OldExtract = St->getOperand(1); SDValue ExtOp0 = OldExtract.getOperand(0); unsigned VecSize = ExtOp0.getValueSizeInBits(); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } return SDValue(); } /// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements /// of its first operand, then on successive elements of its second operand, /// returning the resulting values in a vector. For example, if /// A = < float a0, float a1, float a2, float a3 > /// and /// B = < float b0, float b1, float b2, float b3 > /// then the result of doing a horizontal operation on A and B is /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > // and // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > // which is A horizontal-op B. // At least one of the operands should be a vector shuffle. if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && RHS.getOpcode() != ISD::VECTOR_SHUFFLE) return false; MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to // operate independently on 128-bit lanes. unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts / NumLanes; assert((NumLaneElts % 2 == 0) && "Vector type should have an even number of elements in each lane"); unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask // If LHS is not a shuffle then pretend it is the shuffle // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. SDValue A, B; SmallVector LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (!LHS.getOperand(0).isUndef()) A = LHS.getOperand(0); if (!LHS.getOperand(1).isUndef()) B = LHS.getOperand(1); ArrayRef Mask = cast(LHS.getNode())->getMask(); std::copy(Mask.begin(), Mask.end(), LMask.begin()); } else { if (!LHS.isUndef()) A = LHS; for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; SmallVector RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (!RHS.getOperand(0).isUndef()) C = RHS.getOperand(0); if (!RHS.getOperand(1).isUndef()) D = RHS.getOperand(1); ArrayRef Mask = cast(RHS.getNode())->getMask(); std::copy(Mask.begin(), Mask.end(), RMask.begin()); } else { if (!RHS.isUndef()) C = RHS; for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } // Check that the shuffles are both shuffling the same vectors. if (!(A == C && B == D) && !(A == D && B == C)) return false; // If everything is UNDEF then bail out: it would be better to fold to UNDEF. if (!A.getNode() && !B.getNode()) return false; // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) ShuffleVectorSDNode::commuteMask(RMask); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0; i != NumLaneElts; ++i) { int LIdx = LMask[i+l], RIdx = RMask[i+l]; // Ignore any UNDEF components. if (LIdx < 0 || RIdx < 0 || (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. unsigned Src = (i/HalfLaneElts); // each lane is split between srcs int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; if (!(LIdx == Index && RIdx == Index + 1) && !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. return true; } /// Do target-specific dag combines on floating-point adds/subs. static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); bool IsFadd = N->getOpcode() == ISD::FADD; assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, IsFadd)) { auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } return SDValue(); } /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify /// the codegen. /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDLoc &DL) { assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"); SDValue Src = N->getOperand(0); unsigned Opcode = Src.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) { unsigned TruncSizeInBits = VT.getScalarSizeInBits(); // Repeated operand, so we are only trading one output truncation for // one input truncation. if (Op0 == Op1) return true; // See if either operand has been extended from a smaller/equal size to // the truncation size, allowing a truncation to combine with the extend. unsigned Opcode0 = Op0.getOpcode(); if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND || Opcode0 == ISD::ZERO_EXTEND) && Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) return true; unsigned Opcode1 = Op1.getOpcode(); if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND || Opcode1 == ISD::ZERO_EXTEND) && Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) return true; // See if either operand is a single use constant which can be constant // folded. SDValue BC0 = peekThroughOneUseBitcasts(Op0); SDValue BC1 = peekThroughOneUseBitcasts(Op1); return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) || ISD::isBuildVectorOfConstantSDNodes(BC1.getNode()); }; auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0); SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1); return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1); }; // Don't combine if the operation has other uses. if (!N->isOnlyUserOf(Src.getNode())) return SDValue(); // Only support vector truncation for now. // TODO: i64 scalar math would benefit as well. if (!VT.isVector()) return SDValue(); // In most cases its only worth pre-truncating if we're only facing the cost // of one truncation. // i.e. if one of the inputs will constant fold or the input is repeated. switch (Opcode) { case ISD::AND: case ISD::XOR: case ISD::OR: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegalOrPromote(Opcode, VT) && IsRepeatedOpOrFreeTruncation(Op0, Op1)) return TruncateArithmetic(Op0, Op1); break; } case ISD::MUL: // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its // better to truncate if we have the chance. if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) && !TLI.isOperationLegal(Opcode, SrcVT)) return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); LLVM_FALLTHROUGH; case ISD::ADD: { // TODO: ISD::SUB should be here but interferes with combineSubToSubus. SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(Opcode, VT) && IsRepeatedOpOrFreeTruncation(Op0, Op1)) return TruncateArithmetic(Op0, Op1); break; } } return SDValue(); } /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. static SDValue combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, SmallVector &Regs) { assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || Regs[0].getValueType() == MVT::v2i64)); EVT OutVT = N->getValueType(0); EVT OutSVT = OutVT.getVectorElementType(); EVT InVT = Regs[0].getValueType(); EVT InSVT = InVT.getVectorElementType(); SDLoc DL(N); // First, use mask to unset all bits that won't appear in the result. assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && "OutSVT can only be either i8 or i16."); APInt Mask = APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits()); SDValue MaskVal = DAG.getConstant(Mask, DL, InVT); for (auto &Reg : Regs) Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg); MVT UnpackedVT, PackedVT; if (OutSVT == MVT::i8) { UnpackedVT = MVT::v8i16; PackedVT = MVT::v16i8; } else { UnpackedVT = MVT::v4i32; PackedVT = MVT::v8i16; } // In each iteration, truncate the type by a half size. auto RegNum = Regs.size(); for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); j < e; j *= 2, RegNum /= 2) { for (unsigned i = 0; i < RegNum; i++) Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]); for (unsigned i = 0; i < RegNum / 2; i++) Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], Regs[i * 2 + 1]); } // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and // then extract a subvector as the result since v8i8 is not a legal type. if (OutVT == MVT::v8i8) { Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]); Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0], DAG.getIntPtrConstant(0, DL)); return Regs[0]; } else if (RegNum > 1) { Regs.resize(RegNum); return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); } else return Regs[0]; } /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG, SmallVector &Regs) { assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); EVT OutVT = N->getValueType(0); SDLoc DL(N); // Shift left by 16 bits, then arithmetic-shift right by 16 bits. SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); for (auto &Reg : Regs) { Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, Subtarget, DAG); Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, Subtarget, DAG); } for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2], Regs[i * 2 + 1]); if (Regs.size() > 2) { Regs.resize(Regs.size() / 2); return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); } else return Regs[0]; } /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type /// legalization the truncation will be translated into a BUILD_VECTOR with each /// element that is extracted from a vector and then truncated, and it is /// difficult to do this optimization based on them. static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT OutVT = N->getValueType(0); if (!OutVT.isVector()) return SDValue(); SDValue In = N->getOperand(0); if (!In.getValueType().isSimple()) return SDValue(); EVT InVT = In.getValueType(); unsigned NumElems = OutVT.getVectorNumElements(); // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on // SSE2, and we need to take care of it specially. // AVX512 provides vpmovdb. if (!Subtarget.hasSSE2() || Subtarget.hasAVX2()) return SDValue(); EVT OutSVT = OutVT.getVectorElementType(); EVT InSVT = InVT.getVectorElementType(); if (!((InSVT == MVT::i32 || InSVT == MVT::i64) && (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && NumElems >= 8)) return SDValue(); // SSSE3's pshufb results in less instructions in the cases below. if (Subtarget.hasSSSE3() && NumElems == 8 && ((OutSVT == MVT::i8 && InSVT != MVT::i64) || (InSVT == MVT::i32 && OutSVT == MVT::i16))) return SDValue(); SDLoc DL(N); // Split a long vector into vectors of legal type. unsigned RegNum = InVT.getSizeInBits() / 128; SmallVector SubVec(RegNum); unsigned NumSubRegElts = 128 / InSVT.getSizeInBits(); EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts); for (unsigned i = 0; i < RegNum; i++) SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In, DAG.getIntPtrConstant(i * NumSubRegElts, DL)); // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to // truncate 2 x v4i32 to v8i16. if (Subtarget.hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DAG, SubVec); else if (InSVT == MVT::i32) return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec); else return SDValue(); } /// This function transforms vector truncation of 'extended sign-bits' or /// 'extended zero-bits' values. /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Requires SSE2 but AVX512 has fast truncate. if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) return SDValue(); SDValue In = N->getOperand(0); if (!In.getValueType().isSimple()) return SDValue(); MVT VT = N->getValueType(0).getSimpleVT(); MVT SVT = VT.getScalarType(); MVT InVT = In.getValueType().getSimpleVT(); MVT InSVT = InVT.getScalarType(); // Check we have a truncation suited for PACKSS. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) return SDValue(); if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); // Use PACKSS if the input has sign-bits that extend all the way to the // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. unsigned NumSignBits = DAG.ComputeNumSignBits(In); unsigned NumPackedBits = std::min(SVT.getSizeInBits(), 16); if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); // Use PACKUS if the input has zero-bits that extend all the way to the // packed/truncated value. e.g. masks, zext_in_reg, etc. KnownBits Known; DAG.computeKnownBits(In, Known); unsigned NumLeadingZeroBits = Known.countMinLeadingZeros(); NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8; if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits)) return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget); return SDValue(); } static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); // Attempt to pre-truncate inputs to arithmetic ops instead. if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL)) return V; // Try to detect AVG pattern first. if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; // Try to combine truncation with unsigned saturation. if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget)) return Val; // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { SDValue BCSrc = Src.getOperand(0); if (BCSrc.getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); } // Try to truncate extended sign/zero bits with PACKSS/PACKUS. if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) return V; return combineVectorTruncation(N, DAG, Subtarget); } /// Returns the negated value if the node \p N flips sign of FP value. /// /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000). /// AVX512F does not have FXOR, so FNEG is lowered as /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))). /// In this case we go though all bitcasts. static SDValue isFNEG(SDNode *N) { if (N->getOpcode() == ISD::FNEG) return N->getOperand(0); SDValue Op = peekThroughBitcasts(SDValue(N, 0)); if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR) return SDValue(); SDValue Op1 = peekThroughBitcasts(Op.getOperand(1)); if (!Op1.getValueType().isFloatingPoint()) return SDValue(); SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); unsigned EltBits = Op1.getScalarValueSizeInBits(); auto isSignMask = [&](const ConstantFP *C) { return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits); }; // There is more than one way to represent the same constant on // the different X86 targets. The type of the node may also depend on size. // - load scalar value and broadcast // - BUILD_VECTOR node // - load from a constant pool. // We check all variants here. if (Op1.getOpcode() == X86ISD::VBROADCAST) { if (auto *C = getTargetConstantFromNode(Op1.getOperand(0))) if (isSignMask(cast(C))) return Op0; } else if (BuildVectorSDNode *BV = dyn_cast(Op1)) { if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode()) if (isSignMask(CN->getConstantFPValue())) return Op0; } else if (auto *C = getTargetConstantFromNode(Op1)) { if (C->getType()->isVectorTy()) { if (auto *SplatV = C->getSplatValue()) if (isSignMask(cast(SplatV))) return Op0; } else if (auto *FPConst = dyn_cast(C)) if (isSignMask(FPConst)) return Op0; } return SDValue(); } /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT OrigVT = N->getValueType(0); SDValue Arg = isFNEG(N); assert(Arg.getNode() && "N is expected to be an FNEG node"); EVT VT = Arg.getValueType(); EVT SVT = VT.getScalarType(); SDLoc DL(N); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // If we're negating a FMUL node on a target with FMA, then we can avoid the // use of a constant by performing (-0 - A*B) instead. // FIXME: Check rounding control flags as well once it becomes available. if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Zero); return DAG.getBitcast(OrigVT, NewNode); } // If we're negating an FMA node, then we can adjust the // instruction to include the extra negation. unsigned NewOpcode = 0; if (Arg.hasOneUse()) { switch (Arg.getOpcode()) { case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break; case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; // We can't handle scalar intrinsic node here because it would only // invert one element and not the whole vector. But we could try to handle // a negation of the lower element only. } } if (NewOpcode) return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg.getNode()->ops())); return SDValue(); } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); // If we have integer vector types available, use the integer opcodes. if (VT.isVector() && Subtarget.hasSSE2()) { SDLoc dl(N); MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); unsigned IntOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected FP logic op"); case X86ISD::FOR: IntOpcode = ISD::OR; break; case X86ISD::FXOR: IntOpcode = ISD::XOR; break; case X86ISD::FAND: IntOpcode = ISD::AND; break; case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; } SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); return DAG.getBitcast(VT, IntOp); } return SDValue(); } /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val) static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() != ISD::XOR) return SDValue(); SDValue LHS = N->getOperand(0); auto *RHSC = dyn_cast(N->getOperand(1)); if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC) return SDValue(); X86::CondCode NewCC = X86::GetOppositeBranchCondition( X86::CondCode(LHS->getConstantOperandVal(0))); SDLoc DL(N); return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); } static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // If this is SSE1 only convert to FXOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && N->getValueType(0) == MVT::v4i32) { return DAG.getBitcast( MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N->getOperand(0)), DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); } if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue SetCC = foldXor1SetCC(N, DAG)) return SetCC; if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) return RV; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; if (isFNEG(N)) return combineFneg(N, DAG, Subtarget); return SDValue(); } static bool isNullFPScalarOrVectorConst(SDValue V) { return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode()); } /// If a value is a scalar FP zero or a vector FP zero (potentially including /// undefined elements), return a zero constant that may be used to fold away /// that value. In the case of a vector, the returned constant will not contain /// undefined elements even if the input parameter does. This makes it suitable /// to be used as a replacement operand with operations (eg, bitwise-and) where /// an undef should not propagate. static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!isNullFPScalarOrVectorConst(V)) return SDValue(); if (V.getValueType().isVector()) return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V)); return V; } static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); // Vector types are handled in combineANDXORWithAllOnesIntoANDNP(). if (!((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::f64 && Subtarget.hasSSE2()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2()))) return SDValue(); auto isAllOnesConstantFP = [](SDValue V) { if (V.getSimpleValueType().isVector()) return ISD::isBuildVectorAllOnes(V.getNode()); auto *C = dyn_cast(V); return C && C->getConstantFPValue()->isAllOnesValue(); }; // fand (fxor X, -1), Y --> fandn X, Y if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1))) return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1); // fand X, (fxor Y, -1) --> fandn Y, X if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1))) return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0); return SDValue(); } /// Do target-specific dag combines on X86ISD::FAND nodes. static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FAND(0.0, x) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget)) return V; // FAND(x, 0.0) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) return V; if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget)) return V; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes. static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FANDN(0.0, x) -> x if (isNullFPScalarOrVectorConst(N->getOperand(0))) return N->getOperand(1); // FANDN(x, 0.0) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) return V; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x if (isNullFPScalarOrVectorConst(N->getOperand(0))) return N->getOperand(1); // F[X]OR(x, 0.0) -> x if (isNullFPScalarOrVectorConst(N->getOperand(1))) return N->getOperand(0); if (isFNEG(N)) if (SDValue NewVal = combineFneg(N, DAG, Subtarget)) return NewVal; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); // Only perform optimizations if UnsafeMath is used. if (!DAG.getTarget().Options.UnsafeFPMath) return SDValue(); // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes // into FMINC and FMAXC, which are Commutative operations. unsigned NewOp = 0; switch (N->getOpcode()) { default: llvm_unreachable("unknown opcode"); case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; } return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); } static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Subtarget.useSoftFloat()) return SDValue(); // TODO: Check for global or instruction-level "nnan". In that case, we // should be able to lower to FMAX/FMIN alone. // TODO: If an operand is already known to be a NaN or not a NaN, this // should be an optional swap and FMAX/FMIN. EVT VT = N->getValueType(0); if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) return SDValue(); // This takes at least 3 instructions, so favor a library call when operating // on a scalar and minimizing code size. if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDLoc DL(N); EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( DAG.getDataLayout(), *DAG.getContext(), VT); // There are 4 possibilities involving NaN inputs, and these are the required // outputs: // Op1 // Num NaN // ---------------- // Num | Max | Op0 | // Op0 ---------------- // NaN | Op1 | NaN | // ---------------- // // The SSE FP max/min instructions were not designed for this case, but rather // to implement: // Min = Op1 < Op0 ? Op1 : Op0 // Max = Op1 > Op0 ? Op1 : Op0 // // So they always return Op0 if either input is a NaN. However, we can still // use those instructions for fmaxnum by selecting away a NaN input. // If either operand is NaN, the 2nd source operand (Op0) is passed through. auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); } /// Do target-specific dag combines on X86ISD::ANDNP nodes. static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // ANDNP(0, x) -> x if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) return N->getOperand(1); // ANDNP(x, 0) -> 0 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N)); EVT VT = N->getValueType(0); // Attempt to recursively combine a bitmask ANDNP with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } } return SDValue(); } static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // BT ignores high bits in the bit index operand. unsigned BitWidth = N1.getValueSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask)) return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1); return SDValue(); } static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast(N1)->getVT(); SDLoc dl(N); // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the // both SSE and AVX2 since there is no sign-extended shift right // operation on a vector with 64-bit elements. //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); // EXTLOAD has a better solution on AVX2, // it may be replaced with X86ISD::VSEXT node. if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256()) if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } } return SDValue(); } /// sext(add_nsw(x, C)) --> add(sext(x), C_sext) /// zext(add_nuw(x, C)) --> add(zext(x), C_zext) /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes /// opportunities to combine math ops, use an LEA, or use a complex addressing /// mode. This can eliminate extend, add, and shift instructions. static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Ext->getOpcode() != ISD::SIGN_EXTEND && Ext->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); // TODO: This should be valid for other integer types. EVT VT = Ext->getValueType(0); if (VT != MVT::i64) return SDValue(); SDValue Add = Ext->getOperand(0); if (Add.getOpcode() != ISD::ADD) return SDValue(); bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND; bool NSW = Add->getFlags().hasNoSignedWrap(); bool NUW = Add->getFlags().hasNoUnsignedWrap(); // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding // into the 'zext' if ((Sext && !NSW) || (!Sext && !NUW)) return SDValue(); // Having a constant operand to the 'add' ensures that we are not increasing // the instruction count because the constant is extended for free below. // A constant operand can also become the displacement field of an LEA. auto *AddOp1 = dyn_cast(Add.getOperand(1)); if (!AddOp1) return SDValue(); // Don't make the 'add' bigger if there's no hope of combining it with some // other 'add' or 'shl' instruction. // TODO: It may be profitable to generate simpler LEA instructions in place // of single 'add' instructions, but the cost model for selecting an LEA // currently has a high threshold. bool HasLEAPotential = false; for (auto *User : Ext->uses()) { if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { HasLEAPotential = true; break; } } if (!HasLEAPotential) return SDValue(); // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'. int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue(); SDValue AddOp0 = Add.getOperand(0); SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0); SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); // The wider add is guaranteed to not wrap because both operands are // sign-extended. SDNodeFlags Flags; Flags.setNoSignedWrap(NSW); Flags.setNoUnsignedWrap(NUW); return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags); } /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly /// extends from AH (which we otherwise need to do contortions to access). static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); auto OpcodeN = N->getOpcode(); auto OpcodeN0 = N0.getOpcode(); if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) return SDValue(); EVT VT = N->getValueType(0); EVT InVT = N0.getValueType(); if (N0.getResNo() != 1 || InVT != MVT::i8 || !(VT == MVT::i32 || VT == MVT::i64)) return SDValue(); SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32); auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG : X86ISD::UDIVREM8_ZEXT_HREG; SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), N0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); // If this was a 64-bit extend, complete it. if (VT == MVT::i64) return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1)); return R.getValue(1); } // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant // operands and the result of CMOV is not used anywhere else - promote CMOV // itself instead of promoting its result. This could be beneficial, because: // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two // (or more) pseudo-CMOVs only when they go one-after-another and // getting rid of result extension code after CMOV will help that. // 2) Promotion of constant CMOV arguments is free, hence the // {ANY,SIGN,ZERO}_EXTEND will just be deleted. // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this // promotion is also good in terms of code-size. // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit // promotion). static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) { SDValue CMovN = Extend->getOperand(0); if (CMovN.getOpcode() != X86ISD::CMOV) return SDValue(); EVT TargetVT = Extend->getValueType(0); unsigned ExtendOpcode = Extend->getOpcode(); SDLoc DL(Extend); EVT VT = CMovN.getValueType(); SDValue CMovOp0 = CMovN.getOperand(0); SDValue CMovOp1 = CMovN.getOperand(1); bool DoPromoteCMOV = (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) && CMovN.hasOneUse() && (isa(CMovOp0.getNode()) && isa(CMovOp1.getNode())); if (!DoPromoteCMOV) return SDValue(); CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0); CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1); return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1, CMovN.getOperand(2), CMovN.getOperand(3)); } // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). // This is more or less the reverse of combineBitcastvxi1. static SDValue combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && Opcode != ISD::ANY_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); EVT InSVT = N0.getValueType().getScalarType(); unsigned EltSizeInBits = SVT.getSizeInBits(); // Input type must be extending a bool vector (bit-casted from a scalar // integer) to legal integer types. if (!VT.isVector()) return SDValue(); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) return SDValue(); if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) return SDValue(); SDValue N00 = N0.getOperand(0); EVT SclVT = N0.getOperand(0).getValueType(); if (!SclVT.isScalarInteger()) return SDValue(); SDLoc DL(N); SDValue Vec; SmallVector ShuffleMask; unsigned NumElts = VT.getVectorNumElements(); assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); // Broadcast the scalar integer to the vector elements. if (NumElts > EltSizeInBits) { // If the scalar integer is greater than the vector element size, then we // must split it down into sub-sections for broadcasting. For example: // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); unsigned Scale = NumElts / EltSizeInBits; EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); Vec = DAG.getBitcast(VT, Vec); for (unsigned i = 0; i != Scale; ++i) ShuffleMask.append(EltSizeInBits, i); } else { // For smaller scalar integers, we can simply any-extend it to the vector // element size (we don't care about the upper bits) and broadcast it to all // elements. SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); ShuffleMask.append(NumElts, 0); } Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); // Now, mask the relevant bit in each element. SmallVector Bits; for (unsigned i = 0; i != NumElts; ++i) { int BitIdx = (i % EltSizeInBits); APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); Bits.push_back(DAG.getConstant(Bit, DL, SVT)); } SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); // Compare against the bitmask and extend the result. EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); Vec = DAG.getSExtOrTrunc(Vec, DL, VT); // For SEXT, this is now done, otherwise shift the result down for // zero-extension. if (Opcode == ISD::SIGN_EXTEND) return Vec; return DAG.getNode(ISD::SRL, DL, VT, Vec, DAG.getConstant(EltSizeInBits - 1, DL, VT)); } /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating /// with UNDEFs) of the input to vectors of the same size as the target type /// which then extends the lowest elements. static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); EVT InVT = N0.getValueType(); EVT InSVT = InVT.getScalarType(); // Input type must be a vector and we must be extending legal integer types. if (!VT.isVector()) return SDValue(); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); // On AVX2+ targets, if the input/output types are both legal then we will be // able to use SIGN_EXTEND/ZERO_EXTEND directly. if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && DAG.getTargetLoweringInfo().isTypeLegal(InVT)) return SDValue(); SDLoc DL(N); auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { EVT InVT = N.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), Size / InVT.getScalarSizeInBits()); SmallVector Opnds(Size / InVT.getSizeInBits(), DAG.getUNDEF(InVT)); Opnds[0] = N; return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); }; // If target-size is less than 128-bits, extend to a type that would extend // to 128 bits, extend that and extract the original target vector. if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { unsigned Scale = 128 / VT.getSizeInBits(); EVT ExVT = EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, DAG.getIntPtrConstant(0, DL)); } // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. // Also use this if we don't have SSE41 to allow the legalizer do its job. if (!Subtarget.hasSSE41() || VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasInt256()) || (VT.is512BitVector() && Subtarget.hasAVX512())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); return Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) : DAG.getZeroExtendVectorInReg(ExOp, DL, VT); } auto SplitAndExtendInReg = [&](unsigned SplitSize) { unsigned NumVecs = VT.getSizeInBits() / SplitSize; unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); SmallVector Opnds; for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, DAG.getIntPtrConstant(Offset, DL)); SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); SrcVec = Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); Opnds.push_back(SrcVec); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); }; // On pre-AVX2 targets, split into 128-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) return SplitAndExtendInReg(128); // On pre-AVX512 targets, split into 256-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) return SplitAndExtendInReg(256); return SDValue(); } static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = N0.getValueType(); SDLoc DL(N); if (SDValue DivRem8 = getDivRem8(N, DAG)) return DivRem8; if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { // Invert and sign-extend a boolean is the same as zero-extend and subtract // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1. // sext (xor Bool, -1) --> sub (zext Bool), 1 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); } if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; if (VT.isVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; return SDValue(); } static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode. SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); auto invertIfNegative = [](SDValue &V) { if (SDValue NegVal = isFNEG(V.getNode())) { V = NegVal; return true; } return false; }; // Do not convert the passthru input of scalar intrinsics. // FIXME: We could allow negations of the lower element only. bool NegA = N->getOpcode() != X86ISD::FMADDS1 && N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A); bool NegB = invertIfNegative(B); bool NegC = N->getOpcode() != X86ISD::FMADDS3 && N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C); // Negative multiplication when NegA xor NegB bool NegMul = (NegA != NegB); bool HasNeg = NegA || NegB || NegC; unsigned NewOpcode; if (!NegMul) NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB); else NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; // For FMA, we risk reconstructing the node we started with. // In order to avoid this, we check for negation or opcode change. If // one of the two happened, then it is a new node and we return it. if (N->getOpcode() == ISD::FMA) { if (HasNeg || NewOpcode != N->getOpcode()) return DAG.getNode(NewOpcode, dl, VT, A, B, C); return SDValue(); } if (N->getOpcode() == X86ISD::FMADD_RND) { switch (NewOpcode) { case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break; } } else if (N->getOpcode() == X86ISD::FMADDS1) { switch (NewOpcode) { case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break; } } else if (N->getOpcode() == X86ISD::FMADDS3) { switch (NewOpcode) { case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break; } } else if (N->getOpcode() == X86ISD::FMADDS1_RND) { switch (NewOpcode) { case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break; } } else if (N->getOpcode() == X86ISD::FMADDS3_RND) { switch (NewOpcode) { case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break; } } else if (N->getOpcode() == X86ISD::FMADD4S) { switch (NewOpcode) { case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break; } } else { llvm_unreachable("Unexpected opcode!"); } // Only return the node is the opcode was changed or one of the // operand was negated. If not, we'll just recreate the same node. if (HasNeg || NewOpcode != N->getOpcode()) { if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, A, B, C); } return SDValue(); } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue NegVal = isFNEG(N->getOperand(2).getNode()); if (!NegVal) return SDValue(); unsigned NewOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break; case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break; case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break; case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break; } if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), NegVal, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), NegVal); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. This transformation is necessary because // ISD::SETCC is always legalized to i8. SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, dl, VT)); } } if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, dl, VT)); } } if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; if (VT.isVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; if (SDValue DivRem8 = getDivRem8(N, DAG)) return DivRem8; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) return R; return SDValue(); } /// Try to map a 128-bit or larger integer comparison to vector instructions /// before type legalization splits it up into chunks. static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); // We're looking for an oversized integer equality comparison, but ignore a // comparison with zero because that gets special treatment in EmitTest(). SDValue X = SetCC->getOperand(0); SDValue Y = SetCC->getOperand(1); EVT OpVT = X.getValueType(); unsigned OpSize = OpVT.getSizeInBits(); if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y)) return SDValue(); // Bail out if we know that this is not really just an oversized integer. if (peekThroughBitcasts(X).getValueType() == MVT::f128 || peekThroughBitcasts(Y).getValueType() == MVT::f128) return SDValue(); // TODO: Use PXOR + PTEST for SSE4.1 or later? // TODO: Add support for AVX-512. EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX2())) { EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; SDValue VecX = DAG.getBitcast(VecVT, X); SDValue VecY = DAG.getBitcast(VecVT, Y); // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL, MVT::i32); return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); } return SDValue(); } static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast(N->getOperand(2))->get(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if (CC == ISD::SETNE || CC == ISD::SETEQ) { EVT OpVT = LHS.getValueType(); // 0-x == y --> x+y == 0 // 0-x != y --> x+y != 0 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } // x == 0-y --> x+y == 0 // x != 0-y --> x+y != 0 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) return V; } if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { // Put build_vectors on the right. if (LHS.getOpcode() == ISD::BUILD_VECTOR) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); } bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1); bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); if (IsSEXT0 && IsVZero1) { assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); if (CC == ISD::SETGT) return DAG.getConstant(0, DL, VT); if (CC == ISD::SETLE) return DAG.getConstant(1, DL, VT); if (CC == ISD::SETEQ || CC == ISD::SETGE) return DAG.getNOT(DL, LHS.getOperand(0), VT); assert((CC == ISD::SETNE || CC == ISD::SETLT) && "Unexpected condition code!"); return LHS.getOperand(0); } } // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early // to avoid scalarization via legalization because v4i32 is not a legal type. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 && LHS.getValueType() == MVT::v4f32) return LowerVSETCC(SDValue(N, 0), Subtarget, DAG); return SDValue(); } static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); // MOVMSK only uses the MSB from each vector element. KnownBits Known; APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) { DCI.AddToWorklist(Src.getNode()); DCI.CommitTargetLoweringOpt(TLO); return SDValue(N, 0); } return SDValue(); } static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); if (DCI.isBeforeLegalizeOps()) { SDValue Index = N->getOperand(4); // Remove any sign extends from 32 or smaller to larger than 32. // Only do this before LegalizeOps in case we need the sign extend for // legalization. if (Index.getOpcode() == ISD::SIGN_EXTEND) { if (Index.getScalarValueSizeInBits() > 32 && Index.getOperand(0).getScalarValueSizeInBits() <= 32) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index.getOperand(0); DAG.UpdateNodeOperands(N, NewOps); // The original sign extend has less users, add back to worklist in case // it needs to be removed DCI.AddToWorklist(Index.getNode()); DCI.AddToWorklist(N); return SDValue(N, 0); } } // Make sure the index is either i32 or i64 unsigned ScalarSize = Index.getScalarValueSizeInBits(); if (ScalarSize != 32 && ScalarSize != 64) { MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32; EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index; DAG.UpdateNodeOperands(N, NewOps); DCI.AddToWorklist(N); return SDValue(N, 0); } // Try to remove zero extends from 32->64 if we know the sign bit of // the input is zero. if (Index.getOpcode() == ISD::ZERO_EXTEND && Index.getScalarValueSizeInBits() == 64 && Index.getOperand(0).getScalarValueSizeInBits() == 32) { if (DAG.SignBitIsZero(Index.getOperand(0))) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index.getOperand(0); DAG.UpdateNodeOperands(N, NewOps); // The original zero extend has less users, add back to worklist in case // it needs to be removed DCI.AddToWorklist(Index.getNode()); DCI.AddToWorklist(N); return SDValue(N, 0); } } } // Gather and Scatter instructions use k-registers for masks. The type of // the masks is v*i1. So the mask will be truncated anyway. // The SIGN_EXTEND_INREG my be dropped. SDValue Mask = N->getOperand(2); if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[2] = Mask.getOperand(0); DAG.UpdateNodeOperands(N, NewOps); return SDValue(N, 0); } // With AVX2 we only demand the upper bit of the mask. if (!Subtarget.hasAVX512()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); KnownBits Known; APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) { DCI.AddToWorklist(Mask.getNode()); DCI.CommitTargetLoweringOpt(TLO); return SDValue(N, 0); } } return SDValue(); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); // Try to simplify the EFLAGS and condition code operands. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) return getSETCC(CC, Flags, DL, DAG); return SDValue(); } /// Optimize branch condition evaluation. static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); // Try to simplify the EFLAGS and condition code operands. // Make sure to not keep references to operands, as combineSetCCEFLAGS can // RAUW them under us. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), N->getOperand(1), Cond, Flags); } return SDValue(); } static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast(N->getOperand(0)->getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getBitcast(VT, NewAnd); return Res; } return SDValue(); } static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); EVT InSVT = InVT.getScalarType(); // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP. return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. if (DAG.SignBitIsZero(Op0)) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0); return SDValue(); } static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG)) return Res; // Now move on to more general possibilities. SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); EVT InSVT = InVT.getScalarType(); // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16 || (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Without AVX512DQ we only support i64 to float scalar conversion. For both // vectors and scalars, see if we know that the upper bits are all the sign // bit, in which case we can truncate the input to i32 and convert from that. if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) { unsigned BitWidth = InVT.getScalarSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); if (NumSignBits >= (BitWidth - 31)) { EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32); if (InVT.isVector()) TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, InVT.getVectorNumElements()); SDLoc dl(N); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); } } // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast(Op0.getNode()); EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 or f128. if (VT == MVT::f16 || VT == MVT::f128) return SDValue(); if (!Ld->isVolatile() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget.is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } } return SDValue(); } static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, N->getOperand(0), N->getOperand(1), Flags); } return SDValue(); } // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. if (X86::isZeroNode(N->getOperand(0)) && X86::isZeroNode(N->getOperand(1)) && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), N->getOperand(2)), DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, N->getOperand(0), N->getOperand(1), Flags); } return SDValue(); } /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit /// which is more useful than 0/1 in some cases. static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) { SDLoc DL(N); // "Condition code B" is also known as "the carry flag" (CF). SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8); SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS); MVT VT = N->getSimpleValueType(0); if (VT == MVT::i8) return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT)); assert(VT == MVT::i1 && "Unexpected type for SETCC node"); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB); } /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { bool IsSub = N->getOpcode() == ISD::SUB; SDValue X = N->getOperand(0); SDValue Y = N->getOperand(1); // If this is an add, canonicalize a zext operand to the RHS. // TODO: Incomplete? What if both sides are zexts? if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && Y.getOpcode() != ISD::ZERO_EXTEND) std::swap(X, Y); // Look through a one-use zext. bool PeekedThroughZext = false; if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { Y = Y.getOperand(0); PeekedThroughZext = true; } // If this is an add, canonicalize a setcc operand to the RHS. // TODO: Incomplete? What if both sides are setcc? // TODO: Should we allow peeking through a zext of the other operand? if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && Y.getOpcode() != X86ISD::SETCC) std::swap(X, Y); if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. auto *ConstantX = dyn_cast(X); if (ConstantX) { if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) || (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) { // This is a complicated way to get -1 or 0 from the carry flag: // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), Y.getOperand(1)); } if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) || (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) { SDValue EFLAGS = Y->getOperand(1); if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { // Swap the operands of a SUB, and we have the same pattern as above. // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB SDValue NewSub = DAG.getNode( X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), NewEFLAGS); } } } if (CC == X86::COND_B) { // X + SETB Z --> X + (mask SBB Z, Z) // X - SETB Z --> X - (mask SBB Z, Z) // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY? SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG); if (SBB.getValueSizeInBits() != VT.getSizeInBits()) SBB = DAG.getZExtOrTrunc(SBB, DL, VT); return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); } if (CC == X86::COND_A) { SDValue EFLAGS = Y->getOperand(1); // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG); if (SBB.getValueSizeInBits() != VT.getSizeInBits()) SBB = DAG.getZExtOrTrunc(SBB, DL, VT); return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); } } if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); SDValue Cmp = Y.getOperand(1); if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || !X86::isZeroNode(Cmp.getOperand(1)) || !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); SDValue Z = Cmp.getOperand(0); EVT ZVT = Z.getValueType(); // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. if (ConstantX) { // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with // fake operands: // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) { SDValue Zero = DAG.getConstant(0, DL, ZVT); SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); } // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' // with fake operands: // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); } } // (cmp Z, 1) sets the carry flag if Z is 0. SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); // Add the flags type for ADC/SBB nodes. SDVTList VTs = DAG.getVTList(VT, MVT::i32); // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, DAG.getConstant(-1ULL, DL, VT), Cmp1); // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, DAG.getConstant(0, DL, VT), Cmp1); } static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); SDValue MulOp = N->getOperand(0); SDValue Phi = N->getOperand(1); if (MulOp.getOpcode() != ISD::MUL) std::swap(MulOp, Phi); if (MulOp.getOpcode() != ISD::MUL) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16) return SDValue(); EVT VT = N->getValueType(0); unsigned RegSize = 128; if (Subtarget.hasBWI()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; unsigned VectorSize = VT.getVectorNumElements() * 16; // If the vector size is less than 128, or greater than the supported RegSize, // do not use PMADD. if (VectorSize < 128 || VectorSize > RegSize) return SDValue(); SDLoc DL(N); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, VT.getVectorNumElements() / 2); // Shrink the operands of mul. SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); // Madd vector size is half of the original vector size SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1); // Fill the rest of the output with 0 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. if (!VT.isVector() || !VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) return SDValue(); unsigned RegSize = 128; if (Subtarget.hasBWI()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. if (VT.getSizeInBits() / 4 > RegSize) return SDValue(); // We know N is a reduction add, which means one of its operands is a phi. // To match SAD, we need the other operand to be a vector select. SDValue SelectOp, Phi; if (Op0.getOpcode() == ISD::VSELECT) { SelectOp = Op0; Phi = Op1; } else if (Op1.getOpcode() == ISD::VSELECT) { SelectOp = Op1; Phi = Op0; } else return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. if(!detectZextAbsDiff(SelectOp, Op0, Op1)) return SDValue(); // SAD pattern detected. Now build a SAD instruction and an addition for // reduction. Note that the number of elements of the result of SAD is less // than the number of elements of its input. Therefore, we could only update // part of elements in the reduction vector. SDValue Sad = createPSADBW(DAG, Op0, Op1, DL); // The output of PSADBW is a vector of i64. // We need to turn the vector of i64 into a vector of i32. // If the reduction vector is at least as wide as the psadbw result, just // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero // anyway. MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); if (VT.getSizeInBits() >= ResVT.getSizeInBits()) Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); else Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); if (VT.getSizeInBits() > ResVT.getSizeInBits()) { // Fill the upper elements with zero to match the add width. SDValue Zero = DAG.getConstant(0, DL, VT); Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, DAG.getIntPtrConstant(0, DL)); } return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); } /// Convert vector increment or decrement to sub/add with an all-ones constant: /// add X, <1, 1...> --> sub X, <-1, -1...> /// sub X, <1, 1...> --> add X, <-1, -1...> /// The all-ones vector constant can be materialized using a pcmpeq instruction /// that is commonly recognized as an idiom (has no register dependency), so /// that's better/smaller than loading a splat 1 constant. static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && "Unexpected opcode for increment/decrement transform"); // Pseudo-legality check: getOnesVector() expects one of these types, so bail // out and wait for legalization if we have an unsupported vector length. EVT VT = N->getValueType(0); if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); SDNode *N1 = N->getOperand(1).getNode(); APInt SplatVal; if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue()) return SDValue(); SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasVectorReduction()) { if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) return Sad; if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) return MAdd; } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); if (SDValue V = combineIncDecVector(N, DAG)) return V; return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); // PSUBUS is supported, starting from SSE2, but special preprocessing // for v8i32 requires umin, which appears in SSE41. if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) && !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && !(Subtarget.hasAVX512() && Subtarget.hasBWI() && (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 || VT == MVT::v8i64))) return SDValue(); SDValue SubusLHS, SubusRHS; // Try to find umax(a,b) - b or a - umin(a,b) patterns // they may be converted to subus(a,b). // TODO: Need to add IR cannonicialization for this code. if (Op0.getOpcode() == ISD::UMAX) { SubusRHS = Op1; SDValue MaxLHS = Op0.getOperand(0); SDValue MaxRHS = Op0.getOperand(1); if (MaxLHS == Op1) SubusLHS = MaxRHS; else if (MaxRHS == Op1) SubusLHS = MaxLHS; else return SDValue(); } else if (Op1.getOpcode() == ISD::UMIN) { SubusLHS = Op0; SDValue MinLHS = Op1.getOperand(0); SDValue MinRHS = Op1.getOperand(1); if (MinLHS == Op0) SubusRHS = MinRHS; else if (MinRHS == Op0) SubusRHS = MinLHS; else return SDValue(); } else return SDValue(); // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with // special preprocessing in some cases. if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64) return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS); // Special preprocessing case can be only applied // if the value was zero extended from 16 bit, // so we require first 16 bits to be zeros for 32 bit // values, or first 48 bits for 64 bit values. KnownBits Known; DAG.computeKnownBits(SubusLHS, Known); unsigned NumZeros = Known.countMinLeadingZeros(); if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16) return SDValue(); EVT ExtType = SubusLHS.getValueType(); EVT ShrinkedType; if (VT == MVT::v8i32 || VT == MVT::v8i64) ShrinkedType = MVT::v8i16; else ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16; // If SubusLHS is zeroextended - truncate SubusRHS to it's // size SubusRHS = umin(0xFFF.., SubusRHS). SDValue SaturationConst = DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(), ShrinkedType.getScalarSizeInBits()), SDLoc(SubusLHS), ExtType); SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS, SaturationConst); SDValue NewSubusLHS = DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType); SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType); SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType, NewSubusLHS, NewSubusRHS); // Zero extend the result, it may be used somewhere as 32 bit, // if not zext and following trunc will shrink. return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // X86 can't encode an immediate LHS of a sub. See if we can push the // negation into a preceding instruction. if (ConstantSDNode *C = dyn_cast(Op0)) { // If the RHS of the sub is a XOR with one use and a constant, invert the // immediate. Then add one to the LHS of the sub so we can turn // X-Y -> X+~Y+1, saving one register. if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && isa(Op1.getOperand(1))) { APInt XorC = cast(Op1.getOperand(1))->getAPIntValue(); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getConstant(~XorC, SDLoc(Op1), VT)); return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT)); } } // Try to synthesize horizontal subs from subs of shuffles. EVT VT = N->getValueType(0); if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); if (SDValue V = combineIncDecVector(N, DAG)) return V; // Try to create PSUBUS if SUB's argument is max/min if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) return V; return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalize()) return SDValue(); SDLoc DL(N); unsigned Opcode = N->getOpcode(); MVT VT = N->getSimpleValueType(0); MVT SVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = SVT.getSizeInBits(); SDValue Op = N->getOperand(0); MVT OpVT = Op.getSimpleValueType(); MVT OpEltVT = OpVT.getVectorElementType(); unsigned OpEltSizeInBits = OpEltVT.getSizeInBits(); unsigned InputBits = OpEltSizeInBits * NumElts; // Perform any constant folding. // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled. APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) { APInt Undefs(NumElts, 0); SmallVector Vals(NumElts, APInt(EltSizeInBits, 0)); bool IsZEXT = (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG); for (unsigned i = 0; i != NumElts; ++i) { if (UndefElts[i]) { Undefs.setBit(i); continue; } Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits) : EltBits[i].sextOrTrunc(EltSizeInBits); } return getConstVector(Vals, Undefs, VT, DAG, DL); } // (vzext (bitcast (vzext (x)) -> (vzext x) // TODO: (vsext (bitcast (vsext (x)) -> (vsext x) SDValue V = peekThroughBitcasts(Op); if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) { MVT InnerVT = V.getSimpleValueType(); MVT InnerEltVT = InnerVT.getVectorElementType(); // If the element sizes match exactly, we can just do one larger vzext. This // is always an exact type match as vzext operates on integer types. if (OpEltVT == InnerEltVT) { assert(OpVT == InnerVT && "Types must match for vzext!"); return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); } // The only other way we can combine them is if only a single element of the // inner vzext is used in the input to the outer vzext. if (InnerEltVT.getSizeInBits() < InputBits) return SDValue(); // In this case, the inner vzext is completely dead because we're going to // only look at bits inside of the low element. Just do the outer vzext on // a bitcast of the input to the inner. return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V)); } // Check if we can bypass extracting and re-inserting an element of an input // vector. Essentially: // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) // TODO: Add X86ISD::VSEXT support if (Opcode == X86ISD::VZEXT && V.getOpcode() == ISD::SCALAR_TO_VECTOR && V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { SDValue ExtractedV = V.getOperand(0); SDValue OrigV = ExtractedV.getOperand(0); if (isNullConstant(ExtractedV.getOperand(1))) { MVT OrigVT = OrigV.getSimpleValueType(); // Extract a subvector if necessary... if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), OrigVT.getVectorNumElements() / Ratio); OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, DAG.getIntPtrConstant(0, DL)); } Op = DAG.getBitcast(OpVT, OrigV); return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); } } return SDValue(); } static SDValue combineTestM(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); MVT VT = N->getSimpleValueType(0); SDLoc DL(N); // TEST (AND a, b) ,(AND a, b) -> TEST a, b if (Op0 == Op1 && Op1->getOpcode() == ISD::AND) return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0), Op0->getOperand(1)); // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero) // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero) if (ISD::isBuildVectorAllZeros(Op0.getNode()) || ISD::isBuildVectorAllZeros(Op1.getNode())) return getZeroVector(VT, Subtarget, DAG, DL); return SDValue(); } static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); SDLoc DL(N); if (N->getOperand(0) == N->getOperand(1)) { if (N->getOpcode() == X86ISD::PCMPEQ) return getOnesVector(VT, DAG, DL); if (N->getOpcode() == X86ISD::PCMPGT) return getZeroVector(VT, Subtarget, DAG, DL); } return SDValue(); } static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT OpVT = N->getSimpleValueType(0); // Early out for mask vectors. if (OpVT.getVectorElementType() == MVT::i1) return SDValue(); SDLoc dl(N); SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); unsigned IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); if (ISD::isBuildVectorAllZeros(Vec.getNode())) { // Inserting zeros into zeros is a nop. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) return Vec; // If we're inserting into a zero vector and then into a larger zero vector, // just insert into the larger zero vector directly. if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { unsigned Idx2Val = SubVec.getConstantOperandVal(2); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec.getOperand(1), DAG.getIntPtrConstant(IdxVal + Idx2Val, dl)); } // If we're inserting a bitcast into zeros, rewrite the insert and move the // bitcast to the other side. This helps with detecting zero extending // during isel. // TODO: Is this useful for other indices than 0? if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) { MVT CastVT = SubVec.getOperand(0).getSimpleValueType(); unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits(); MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems); SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getBitcast(NewVT, Vec), SubVec.getOperand(0), N->getOperand(2)); return DAG.getBitcast(OpVT, Insert); } } // If this is an insert of an extract, combine to a shuffle. Don't do this // if the insert or extract can be represented with a subregister operation. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && SubVec.getOperand(0).getSimpleValueType() == OpVT && (IdxVal != 0 || !Vec.isUndef())) { int ExtIdxVal = SubVec.getConstantOperandVal(1); if (ExtIdxVal != 0) { int VecNumElts = OpVT.getVectorNumElements(); int SubVecNumElts = SubVecVT.getVectorNumElements(); SmallVector Mask(VecNumElts); // First create an identity shuffle mask. for (int i = 0; i != VecNumElts; ++i) Mask[i] = i; // Now insert the extracted portion. for (int i = 0; i != SubVecNumElts; ++i) Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); } } // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte // load: // (insert_subvector (insert_subvector undef, (load16 addr), 0), // (load16 addr + 16), Elts/2) // --> load32 addr // or: // (insert_subvector (insert_subvector undef, (load32 addr), 0), // (load32 addr + 32), Elts/2) // --> load64 addr // or a 16-byte or 32-byte broadcast: // (insert_subvector (insert_subvector undef, (load16 addr), 0), // (load16 addr), Elts/2) // --> X86SubVBroadcast(load16 addr) // or: // (insert_subvector (insert_subvector undef, (load32 addr), 0), // (load32 addr), Elts/2) // --> X86SubVBroadcast(load32 addr) if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { auto *Idx2 = dyn_cast(Vec.getOperand(2)); if (Idx2 && Idx2->getZExtValue() == 0) { SDValue SubVec2 = Vec.getOperand(1); // If needed, look through bitcasts to get to the load. if (auto *FirstLd = dyn_cast(peekThroughBitcasts(SubVec2))) { bool Fast; unsigned Alignment = FirstLd->getAlignment(); unsigned AS = FirstLd->getAddressSpace(); const X86TargetLowering *TLI = Subtarget.getTargetLowering(); if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), OpVT, AS, Alignment, &Fast) && Fast) { SDValue Ops[] = {SubVec2, SubVec}; if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, Subtarget, false)) return Ld; } } // If lower/upper loads are the same and the only users of the load, then // lower to a VBROADCASTF128/VBROADCASTI128/etc. if (auto *Ld = dyn_cast(peekThroughOneUseBitcasts(SubVec2))) if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); // If this is subv_broadcast insert into both halves, use a larger // subv_broadcast. if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec.getOperand(0)); // If we're inserting all zeros into the upper half, change this to // an insert into an all zeros vector. We will match this to a move // with implicit upper bit zeroing during isel. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2, Vec.getOperand(2)); // If we are inserting into both halves of the vector, the starting // vector should be undef. If it isn't, make it so. Only do this if the // the early insert has no other uses. // TODO: Should this be a generic DAG combine? if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) { Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), SubVec2, Vec.getOperand(2)); DCI.AddToWorklist(Vec.getNode()); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, N->getOperand(2)); } } } return SDValue(); } static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT OpVT = N->getSimpleValueType(0); SDValue InVec = N->getOperand(0); unsigned IdxVal = cast(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N)); if (ISD::isBuildVectorAllOnes(InVec.getNode())) { if (OpVT.getScalarType() == MVT::i1) return DAG.getConstant(1, SDLoc(N), OpVT); return getOnesVector(OpVT, DAG, SDLoc(N)); } if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector( OpVT, SDLoc(N), InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements())); return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: return combineExtractSubvector(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: case ISD::SRA: case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget); case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::PACKSS: case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); case X86ISD::VSHLI: case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShiftImm(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VSEXT: case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::VBROADCAST: case X86ISD::VPPERM: case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIV3: case X86ISD::VPERMIL2: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::VZEXT_MOVL: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMADDS1_RND: case X86ISD::FMADDS3_RND: case X86ISD::FMADDS1: case X86ISD::FMADDS3: case X86ISD::FMADD4S: case ISD::FMA: return combineFMA(N, DAG, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); case X86ISD::MGATHER: case X86ISD::MSCATTER: case ISD::MGATHER: case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); } return SDValue(); } /// Return true if the target has native support for the specified value type /// and it is 'desirable' to use the type for the given node type. e.g. On x86 /// i16 is legal, but undesirable since i16 instruction encodings are longer and /// some i16 instructions are slow. bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; if (VT != MVT::i16) return true; switch (Opc) { default: return true; case ISD::LOAD: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: case ISD::SRL: case ISD::SUB: case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: return false; } } /// This function checks if any of the users of EFLAGS copies the EFLAGS. We /// know that the code that lowers COPY of EFLAGS has to use the stack, and if /// we don't adjust the stack we clobber the first frame index. /// See X86InstrInfo::copyPhysReg. static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); return any_of(MRI.reg_instructions(X86::EFLAGS), [](const MachineInstr &RI) { return RI.isCopy(); }); } void X86TargetLowering::finalizeLowering(MachineFunction &MF) const { if (hasCopyImplyingStackAdjustment(MF)) { MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setHasCopyImplyingStackAdjustment(true); } TargetLoweringBase::finalizeLowering(MF); } /// This method query the target whether it is beneficial for dag combiner to /// promote the specified node. If true, it should return the desired promotion /// type by reference. bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { EVT VT = Op.getValueType(); if (VT != MVT::i16) return false; bool Promote = false; bool Commute = false; switch (Op.getOpcode()) { default: break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Promote = true; break; case ISD::SHL: case ISD::SRL: { SDValue N0 = Op.getOperand(0); // Look out for (store (shl (load), x)). if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) return false; Promote = true; break; } case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: Commute = true; LLVM_FALLTHROUGH; case ISD::SUB: { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); if (!Commute && MayFoldLoad(N1)) return false; // Avoid disabling potential load folding opportunities. if (MayFoldLoad(N0) && (!isa(N1) || MayFoldIntoStore(Op))) return false; if (MayFoldLoad(N1) && (!isa(N0) || MayFoldIntoStore(Op))) return false; Promote = true; } } PVT = MVT::i32; return Promote; } bool X86TargetLowering:: isDesirableToCombineBuildVectorToShuffleTruncate( ArrayRef ShuffleMask, EVT SrcVT, EVT TruncVT) const { assert(SrcVT.getVectorNumElements() == ShuffleMask.size() && "Element count mismatch"); assert( Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) && "Shuffle Mask expected to be legal"); // For 32-bit elements VPERMD is better than shuffle+truncate. // TODO: After we improve lowerBuildVector, add execption for VPERMW. if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2()) return false; if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask)) return false; return true; } //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// // Helper to match a string separated by whitespace. static bool matchAsm(StringRef S, ArrayRef Pieces) { S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. for (StringRef Piece : Pieces) { if (!S.startswith(Piece)) // Check if the piece matches. return false; S = S.substr(Piece.size()); StringRef::size_type Pos = S.find_first_not_of(" \t"); if (Pos == 0) // We matched a prefix. return false; S = S.substr(Pos); } return S.empty(); } static bool clobbersFlagRegisters(const SmallVector &AsmPieces) { if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { if (AsmPieces.size() == 3) return true; else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) return true; } } return false; } bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast(CI->getCalledValue()); const std::string &AsmStr = IA->getAsmString(); IntegerType *Ty = dyn_cast(CI->getType()); if (!Ty || Ty->getBitWidth() % 16 != 0) return false; // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; case 1: // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || matchAsm(AsmPieces[0], {"bswapl", "$0"}) || matchAsm(AsmPieces[0], {"bswapq", "$0"}) || matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. return IntrinsicLowering::LowerToByteSwap(CI); } // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); if (Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && matchAsm(AsmPieces[1], {"bswap", "%edx"}) && matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) return IntrinsicLowering::LowerToByteSwap(CI); } } break; } return false; } /// Given a constraint letter, return the type of constraint for this target. X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': case 'q': case 'Q': case 'f': case 't': case 'u': case 'y': case 'x': case 'v': case 'Y': case 'l': case 'k': // AVX512 masking registers. return C_RegisterClass; case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': return C_Register; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'G': case 'C': case 'e': case 'Z': return C_Other; default: break; } } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; case 'Y': switch (Constraint[1]) { default: break; case 'z': case '0': return C_Register; case 'i': case 'm': case 'k': case 't': case '2': return C_RegisterClass; } } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight X86TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); LLVM_FALLTHROUGH; case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': if (CallOperandVal->getType()->isIntegerTy()) weight = CW_SpecificReg; break; case 'f': case 't': case 'u': if (type->isFloatingPointTy()) weight = CW_SpecificReg; break; case 'y': if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; case 'Y': { unsigned Size = StringRef(constraint).size(); // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y' char NextChar = Size == 2 ? constraint[1] : 'i'; if (Size > 2) break; switch (NextChar) { default: return CW_Invalid; // XMM0 case 'z': case '0': if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) return CW_SpecificReg; return CW_Invalid; // Conditional OpMask regs (AVX512) case 'k': if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) return CW_Register; return CW_Invalid; // Any MMX reg case 'm': if (type->isX86_MMXTy() && Subtarget.hasMMX()) return weight; return CW_Invalid; // Any SSE reg when ISA >= SSE2, same as 'Y' case 'i': case 't': case '2': if (!Subtarget.hasSSE2()) return CW_Invalid; break; } // Fall through (handle "Y" constraint). LLVM_FALLTHROUGH; } case 'v': if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) weight = CW_Register; LLVM_FALLTHROUGH; case 'x': if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256())) weight = CW_Register; break; case 'k': // Enable conditional vector operations using %k<#> registers. if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) weight = CW_Register; break; case 'I': if (ConstantInt *C = dyn_cast(info.CallOperandVal)) { if (C->getZExtValue() <= 31) weight = CW_Constant; } break; case 'J': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 63) weight = CW_Constant; } break; case 'K': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) weight = CW_Constant; } break; case 'L': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) weight = CW_Constant; } break; case 'M': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 3) weight = CW_Constant; } break; case 'N': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xff) weight = CW_Constant; } break; case 'G': case 'C': if (isa(CallOperandVal)) { weight = CW_Constant; } break; case 'e': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) weight = CW_Constant; } break; case 'Z': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xffffffff) weight = CW_Constant; } break; } return weight; } /// Try to replace an X constraint, which matches anything, with another that /// has more specific requirements based on the type of the corresponding /// operand. const char *X86TargetLowering:: LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { if (Subtarget.hasSSE2()) return "Y"; if (Subtarget.hasSSE1()) return "x"; } return TargetLowering::LowerXConstraint(ConstraintVT); } /// Lower the specified operand into the Ops vector. /// If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints for now. if (Constraint.length() > 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'J': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'K': if (ConstantSDNode *C = dyn_cast(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'L': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'M': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'N': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 255) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'O': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. } return; } case 'Z': { // 32-bit unsigned value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. return; } case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast(Op)) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); break; } // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't // be used as immediates. if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) return; // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. GlobalAddressSDNode *GA = nullptr; int64_t Offset = 0; // Match either (GA), (GA+C), (GA+C1+C2), etc. while (1) { if ((GA = dyn_cast(Op))) { Offset += GA->getOffset(); break; } else if (Op.getOpcode() == ISD::ADD) { if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Offset += C->getZExtValue(); Op = Op.getOperand(0); continue; } } else if (Op.getOpcode() == ISD::SUB) { if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Offset += -C->getZExtValue(); Op = Op.getOperand(0); continue; } } // Otherwise, this isn't something we can handle, reject it. return; } const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), GA->getValueType(0), Offset); break; } } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } /// Check if \p RC is a general purpose register class. /// I.e., GR* or one of their variant. static bool isGRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::GR8RegClass) || RC.hasSuperClassEq(&X86::GR16RegClass) || RC.hasSuperClassEq(&X86::GR32RegClass) || RC.hasSuperClassEq(&X86::GR64RegClass) || RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass); } /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. static bool isFRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::FR32XRegClass) || RC.hasSuperClassEq(&X86::FR64XRegClass) || RC.hasSuperClassEq(&X86::VR128XRegClass) || RC.hasSuperClassEq(&X86::VR256XRegClass) || RC.hasSuperClassEq(&X86::VR512RegClass); } std::pair X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { default: break; // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'k': if (Subtarget.hasAVX512()) { // Only supported in AVX512 or later. switch (VT.SimpleTy) { default: break; case MVT::i32: return std::make_pair(0U, &X86::VK32RegClass); case MVT::i16: return std::make_pair(0U, &X86::VK16RegClass); case MVT::i8: return std::make_pair(0U, &X86::VK8RegClass); case MVT::i1: return std::make_pair(0U, &X86::VK1RegClass); case MVT::i64: return std::make_pair(0U, &X86::VK64RegClass); } } break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget.is64Bit()) { if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i64 || VT == MVT::f64) return std::make_pair(0U, &X86::GR64RegClass); break; } LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32_ABCDRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); if (VT == MVT::i64) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget.hasSSE2()) break; LLVM_FALLTHROUGH; case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget.hasSSE1()) break; bool VConstraint = (Constraint[0] == 'v'); switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: case MVT::i32: if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); // AVX types. case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR256XRegClass); return std::make_pair(0U, &X86::VR256RegClass); case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: return std::make_pair(0U, &X86::VR512RegClass); } break; } } else if (Constraint.size() == 2 && Constraint[0] == 'Y') { switch (Constraint[1]) { default: break; case 'i': case 't': case '2': return getRegForInlineAsmConstraint(TRI, "Y", VT); case 'm': if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'z': case '0': if (!Subtarget.hasSSE1()) break; return std::make_pair(X86::XMM0, &X86::VR128RegClass); case 'k': // This register class doesn't allocate k0 for masked vector operation. if (Subtarget.hasAVX512()) { // Only supported in AVX512. switch (VT.SimpleTy) { default: break; case MVT::i32: return std::make_pair(0U, &X86::VK32WMRegClass); case MVT::i16: return std::make_pair(0U, &X86::VK16WMRegClass); case MVT::i8: return std::make_pair(0U, &X86::VK8WMRegClass); case MVT::i1: return std::make_pair(0U, &X86::VK1WMRegClass); case MVT::i64: return std::make_pair(0U, &X86::VK64WMRegClass); } } break; } } // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && Constraint[3] == '(' && (Constraint[4] >= '0' && Constraint[4] <= '7') && Constraint[5] == ')' && Constraint[6] == '}') { Res.first = X86::FP0+Constraint[4]-'0'; Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { Res.first = X86::FP0; Res.second = &X86::RFP80RegClass; return Res; } // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) { Res.first = X86::EFLAGS; Res.second = &X86::CCRRegClass; return Res; } // 'A' means [ER]AX + [ER]DX. if (Constraint == "A") { if (Subtarget.is64Bit()) { Res.first = X86::RAX; Res.second = &X86::GR64_ADRegClass; } else { assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && "Expecting 64, 32 or 16 bit subtarget"); Res.first = X86::EAX; Res.second = &X86::GR32_ADRegClass; } return Res; } return Res; } // Otherwise, check to see if this is a register class of the wrong value // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should // return "eax". This should even work for things like getting 64bit integer // registers when given an f64 type. const TargetRegisterClass *Class = Res.second; // The generic code will match the first register class that contains the // given register. Thus, based on the ordering of the tablegened file, // the "plain" GR classes might not come first. // Therefore, use a helper method. if (isGRClass(*Class)) { unsigned Size = VT.getSizeInBits(); if (Size == 1) Size = 8; unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { bool is64Bit = Subtarget.is64Bit(); const TargetRegisterClass *RC = Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass) : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) : &X86::GR64RegClass; if (RC->contains(DestReg)) Res = std::make_pair(DestReg, RC); } else { // No register found/type mismatch. Res.first = 0; Res.second = nullptr; } } else if (isFRClass(*Class)) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) Res.second = &X86::VR128RegClass; else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) Res.second = &X86::VR256RegClass; else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } } return Res; } int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), // will take 2 allocations in the out of order engine instead of 1 // for plain addressing mode, i.e. inst (reg1). // E.g., // vaddps (%rsi,%drx), %ymm0, %ymm1 // Requires two allocations (one for the load, one for the computation) // whereas: // vaddps (%rsi), %ymm0, %ymm1 // Requires just 1 allocation, i.e., freeing allocations for other operations // and having less micro operations to execute. // // For some X86 architectures, this is even worse because for instance for // stores, the complex addressing mode forces the instruction to use the // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; return -1; } bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. // The exception to this is vector division. Since x86 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (!Subtarget.is64Bit()) return; // Update IsSplitCSR in X86MachineFunctionInfo. X86MachineFunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void X86TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (X86::GR64RegClass.contains(*I)) RC = &X86::GR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { // If the function specifically requests stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); // Generally, if we aren't on Windows, the platform ABI does not include // support for stack probes, so don't emit them. if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO()) return ""; // We need a stack probe to conform to the Windows ABI. Choose the right // symbol. if (Subtarget.is64Bit()) return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } Index: vendor/llvm/dist/lib/Target/X86/X86InstrFPStack.td =================================================================== --- vendor/llvm/dist/lib/Target/X86/X86InstrFPStack.td (revision 327151) +++ vendor/llvm/dist/lib/Target/X86/X86InstrFPStack.td (revision 327152) @@ -1,763 +1,761 @@ //===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file describes the X86 x87 FPU instruction set, defining the // instructions, and properties of the instructions which are needed for code // generation, machine code emission, and analysis. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // FPStack specific DAG Nodes. //===----------------------------------------------------------------------===// def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, SDTCisVT<1, f80>]>; def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, [SDNPHasChain, SDNPInGlue, SDNPMayStore, SDNPMemOperand]>; def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, SDNPMemOperand]>; def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, [SDNPHasChain, SDNPMayStore, SDNPSideEffect, SDNPMemOperand]>; //===----------------------------------------------------------------------===// // FPStack pattern fragments //===----------------------------------------------------------------------===// def fpimm0 : FPImmLeaf; def fpimmneg0 : FPImmLeaf; def fpimm1 : FPImmLeaf; def fpimmneg1 : FPImmLeaf; // Some 'special' instructions - expanded after instruction selection. let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>; def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src), [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>; def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src), [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>; def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src), [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>; def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src), [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>; def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src), [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>; def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src), [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>; def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src), [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; } // All FP Stack operations are represented with four instructions here. The // first three instructions, generated by the instruction selector, use "RFP32" // "RFP64" or "RFP80" registers: traditional register files to reference 32-bit, // 64-bit or 80-bit floating point values. These sizes apply to the values, // not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be // copied to each other without losing information. These instructions are all // pseudo instructions and use the "_Fp" suffix. // In some cases there are additional variants with a mixture of different // register sizes. // The second instruction is defined with FPI, which is the actual instruction // emitted by the assembler. These use "RST" registers, although frequently // the actual register(s) used are implicit. These are always 80 bits. // The FP stackifier pass converts one to the other after register allocation // occurs. // // Note that the FpI instruction should have instruction selection info (e.g. // a pattern) and the FPI instruction should have emission info (e.g. opcode // encoding and asm printing info). // FpIf32, FpIf64 - Floating Point Pseudo Instruction template. // f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. // f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. // f80 instructions cannot use SSE and use neither of these. class FpIf32 pattern, InstrItinClass itin = NoItinerary> : FpI_, Requires<[FPStackf32]>; class FpIf64 pattern, InstrItinClass itin = NoItinerary> : FpI_, Requires<[FPStackf64]>; // Factoring for arithmetic. multiclass FPBinary_rr { // Register op register -> register // These are separated out because they have no reversed form. def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP, [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>; def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP, [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>; def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>; } // The FopST0 series are not included here because of the irregularities // in where the 'r' goes in assembly output. // These instructions cannot address 80-bit memory. multiclass FPBinary { +let mayLoad = 1, hasSideEffects = 1 in { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP32:$dst, (OpNode RFP32:$src1, (loadf32 addr:$src2))), (set RFP32:$dst, (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, [!if(Forward, (set RFP64:$dst, (OpNode RFP64:$src1, (loadf64 addr:$src2))), (set RFP64:$dst, (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP64:$dst, (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), (set RFP64:$dst, (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP80:$dst, (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), (set RFP80:$dst, (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, [!if(Forward, (set RFP80:$dst, (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), (set RFP80:$dst, (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; -let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), !strconcat("f", asmstring, "{s}\t$src")>; -let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, [!if(Forward, (set RFP32:$dst, (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), (set RFP32:$dst, (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP32:$dst, (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), (set RFP32:$dst, (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, [!if(Forward, (set RFP64:$dst, (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), (set RFP64:$dst, (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP64:$dst, (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), (set RFP64:$dst, (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), OneArgFPRW, [!if(Forward, (set RFP80:$dst, (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), (set RFP80:$dst, (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), OneArgFPRW, [!if(Forward, (set RFP80:$dst, (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), (set RFP80:$dst, (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; -let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), !strconcat("fi", asmstring, "{s}\t$src")>; -let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), !strconcat("fi", asmstring, "{l}\t$src")>; +} // mayLoad = 1, hasSideEffects = 1 } let Defs = [FPSW] in { // FPBinary_rr just defines pseudo-instructions, no need to set a scheduling // resources. let hasNoSchedulingInfo = 1 in { defm ADD : FPBinary_rr; defm SUB : FPBinary_rr; defm MUL : FPBinary_rr; defm DIV : FPBinary_rr; } // Sets the scheduling resources for the actual NAME#_Fm defintions. let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary; defm SUB : FPBinary; defm SUBR: FPBinary; } let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary; } let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary; defm DIVR: FPBinary; } } // Defs = [FPSW] class FPST0rInst : FPI<0xD8, fp, (outs), (ins RST:$op), asm>; class FPrST0Inst : FPI<0xDC, fp, (outs), (ins RST:$op), asm>; class FPrST0PInst : FPI<0xDE, fp, (outs), (ins RST:$op), asm>; // NOTE: GAS and apparently all other AT&T style assemblers have a broken notion // of some of the 'reverse' forms of the fsub and fdiv instructions. As such, // we have to put some 'r's in and take them out of weird places. let SchedRW = [WriteFAdd] in { def ADD_FST0r : FPST0rInst ; def ADD_FrST0 : FPrST0Inst ; def ADD_FPrST0 : FPrST0PInst; def SUBR_FST0r : FPST0rInst ; def SUB_FrST0 : FPrST0Inst ; def SUB_FPrST0 : FPrST0PInst; def SUB_FST0r : FPST0rInst ; def SUBR_FrST0 : FPrST0Inst ; def SUBR_FPrST0 : FPrST0PInst; def COM_FST0r : FPST0rInst ; def COMP_FST0r : FPST0rInst ; } // SchedRW let SchedRW = [WriteFMul] in { def MUL_FST0r : FPST0rInst ; def MUL_FrST0 : FPrST0Inst ; def MUL_FPrST0 : FPrST0PInst; } // SchedRW let SchedRW = [WriteFDiv] in { def DIVR_FST0r : FPST0rInst ; def DIV_FrST0 : FPrST0Inst ; def DIV_FPrST0 : FPrST0PInst; def DIV_FST0r : FPST0rInst ; def DIVR_FrST0 : FPrST0Inst ; def DIVR_FPrST0 : FPrST0PInst; } // SchedRW // Unary operations. multiclass FPUnary { def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, [(set RFP32:$dst, (OpNode RFP32:$src))], itin>; def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW, [(set RFP64:$dst, (OpNode RFP64:$src))], itin>; def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, [(set RFP80:$dst, (OpNode RFP80:$src))], itin>; def _F : FPI<0xD9, fp, (outs), (ins), asmstring, itin>; } let Defs = [FPSW] in { let SchedRW = [WriteVecLogic] in { defm CHS : FPUnary; defm ABS : FPUnary; } let SchedRW = [WriteFSqrt] in defm SQRT: FPUnary; let SchedRW = [WriteMicrocoded] in { defm SIN : FPUnary; defm COS : FPUnary; } let SchedRW = [WriteFAdd] in { let hasSideEffects = 0 in { def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; } // hasSideEffects def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst", IIC_FCOMI>; } // SchedRW } // Defs = [FPSW] // Versions of FP instructions that take a single memory operand. Added for the // disassembler; remove as they are included with patterns elsewhere. let SchedRW = [WriteFAddLd] in { def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">; def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">; def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; } // SchedRW let SchedRW = [WriteMicrocoded] in { def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">; def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">; def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">; def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">; } // SchedRW // Floating point cmovs. class FpIf32CMov pattern, InstrItinClass itin> : FpI_, Requires<[FPStackf32, HasCMov]>; class FpIf64CMov pattern, InstrItinClass itin> : FpI_, Requires<[FPStackf64, HasCMov]>; multiclass FPCMov { def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), CondMovFP, [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, cc, EFLAGS))], IIC_FCMOV>; def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), CondMovFP, [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, cc, EFLAGS))], IIC_FCMOV>; def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), CondMovFP, [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, cc, EFLAGS))], IIC_FCMOV>, Requires<[HasCMov]>; } let Defs = [FPSW] in { let SchedRW = [WriteFAdd] in { let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { defm CMOVB : FPCMov; defm CMOVBE : FPCMov; defm CMOVE : FPCMov; defm CMOVP : FPCMov; defm CMOVNB : FPCMov; defm CMOVNBE: FPCMov; defm CMOVNE : FPCMov; defm CMOVNP : FPCMov; } // Uses = [EFLAGS], Constraints = "$src1 = $dst" let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op), "fcmovb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op), "fcmovbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op), "fcmove\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op), "fcmovu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op), "fcmovnb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op), "fcmovnbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op), "fcmovne\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op), "fcmovnu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; } // Predicates = [HasCMov] } // SchedRW // Floating point loads & stores. let SchedRW = [WriteLoad] in { let canFoldAsLoad = 1 in { def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP32:$dst, (loadf32 addr:$src))]>; let isReMaterializable = 1 in def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, [(set RFP64:$dst, (loadf64 addr:$src))]>; def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, [(set RFP80:$dst, (loadf80 addr:$src))]>; } // canFoldAsLoad def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>; def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>; def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, [(set RFP32:$dst, (X86fild addr:$src, i16))]>; def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, [(set RFP32:$dst, (X86fild addr:$src, i32))]>; def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP, [(set RFP32:$dst, (X86fild addr:$src, i64))]>; def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP, [(set RFP64:$dst, (X86fild addr:$src, i16))]>; def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP, [(set RFP64:$dst, (X86fild addr:$src, i32))]>; def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP, [(set RFP64:$dst, (X86fild addr:$src, i64))]>; def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP, [(set RFP80:$dst, (X86fild addr:$src, i16))]>; def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, [(set RFP80:$dst, (X86fild addr:$src, i32))]>; def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, [(set RFP80:$dst, (X86fild addr:$src, i64))]>; } // SchedRW let SchedRW = [WriteStore] in { def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, [(store RFP32:$src, addr:$op)]>; def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, [(truncstoref32 RFP64:$src, addr:$op)]>; def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, [(store RFP64:$src, addr:$op)]>; def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, [(truncstoref32 RFP80:$src, addr:$op)]>; def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, [(truncstoref64 RFP80:$src, addr:$op)]>; // FST does not support 80-bit memory target; FSTP must be used. let mayStore = 1, hasSideEffects = 0 in { def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>; def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>; def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; } // mayStore def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, [(store RFP80:$src, addr:$op)]>; let mayStore = 1, hasSideEffects = 0 in { def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>; def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>; def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>; def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; } // mayStore } // SchedRW let mayLoad = 1, SchedRW = [WriteLoad] in { def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", IIC_FLD>; def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src", IIC_FLD>; def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src", IIC_FLD80>; def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src", IIC_FILD>; def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src", IIC_FILD>; def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src", IIC_FILD>; } let mayStore = 1, SchedRW = [WriteStore] in { def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst", IIC_FST>; def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst", IIC_FST>; def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst", IIC_FST>; def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst", IIC_FST>; def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst", IIC_FST80>; def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst", IIC_FIST>; def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst", IIC_FIST>; def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst", IIC_FIST>; def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst", IIC_FIST>; def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst", IIC_FIST>; } // FISTTP requires SSE3 even though it's a FPStack op. let Predicates = [HasSSE3], SchedRW = [WriteStore] in { def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, [(X86fp_to_i32mem RFP32:$src, addr:$op)]>; def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, [(X86fp_to_i64mem RFP32:$src, addr:$op)]>; def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, [(X86fp_to_i16mem RFP64:$src, addr:$op)]>; def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, [(X86fp_to_i32mem RFP64:$src, addr:$op)]>; def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, [(X86fp_to_i64mem RFP64:$src, addr:$op)]>; def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, [(X86fp_to_i16mem RFP80:$src, addr:$op)]>; def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, [(X86fp_to_i32mem RFP80:$src, addr:$op)]>; def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; } // Predicates = [HasSSE3] let mayStore = 1, SchedRW = [WriteStore] in { def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", IIC_FST>; def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", IIC_FST>; def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst", IIC_FST>; } // FP Stack manipulation instructions. let SchedRW = [WriteMove] in { def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>; def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>; def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>; def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>; } // Floating point constant loads. let isReMaterializable = 1, SchedRW = [WriteZero] in { def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, [(set RFP32:$dst, fpimm0)]>; def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, [(set RFP32:$dst, fpimm1)]>; def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, [(set RFP64:$dst, fpimm0)]>; def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, [(set RFP64:$dst, fpimm1)]>; def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, [(set RFP80:$dst, fpimm0)]>; def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, [(set RFP80:$dst, fpimm1)]>; } let SchedRW = [WriteZero] in { def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>; def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>; } // Floating point compares. let SchedRW = [WriteFAdd] in { def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>; def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>; } // SchedRW } // Defs = [FPSW] let SchedRW = [WriteFAdd] in { // CC = ST(0) cmp ST(i) let Defs = [EFLAGS, FPSW] in { def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>; def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>; def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>; } let Defs = [FPSW], Uses = [ST0] in { def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i) (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>; def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>; def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop (outs), (ins), "fucompp", IIC_FUCOM>; } let Defs = [EFLAGS, FPSW], Uses = [ST0] in { def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i) (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>; def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>; } let Defs = [EFLAGS, FPSW] in { def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), "fcomi\t$reg", IIC_FCOMI>; def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), "fcompi\t$reg", IIC_FCOMI>; } } // SchedRW // Floating point flag ops. let SchedRW = [WriteALU] in { let Defs = [AX], Uses = [FPSW] in def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags (outs), (ins), "fnstsw\t{%ax|ax}", [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>; let Defs = [FPSW] in def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world (outs), (ins i16mem:$dst), "fnstcw\t$dst", [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>; } // SchedRW let Defs = [FPSW], mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>, Sched<[WriteLoad]>; // FPU control instructions let SchedRW = [WriteMicrocoded] in { let Defs = [FPSW] in { def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>; def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg", IIC_FFREE>; def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), "ffreep\t$reg", IIC_FFREE>; // Clear exceptions def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>; } // Defs = [FPSW] } // SchedRW // Operandless floating-point instructions for the disassembler. let SchedRW = [WriteMicrocoded] in { def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>; let Defs = [FPSW] in { def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>; def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>; def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>; def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>; def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>; def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>; def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>; def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>; def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>; def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>; def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>; def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>; def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>; def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>; def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>; def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>; def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>; def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>; def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; } // Defs = [FPSW] def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB, Requires<[HasFXSR]>; def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], IIC_FXSAVE>, TB, Requires<[HasFXSR, In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB, Requires<[HasFXSR]>; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], IIC_FXRSTOR>, TB, Requires<[HasFXSR, In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// // Non-Instruction Patterns //===----------------------------------------------------------------------===// // Required for RET of f32 / f64 / f80 values. def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>; def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>; def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>; // Required for CALL which return f32 / f64 / f80 values. def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>; def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>; def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>; def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, RFP80:$src)>; def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, RFP80:$src)>; def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, RFP80:$src)>; // Floating point constant -0.0 and -1.0 def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>; def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>; def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>; def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>; def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>; def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>; // Used to conv. i64 to f64 since there isn't a SSE version. def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>; // FP extensions map onto simple pseudo-value conversions if they are to/from // the FP stack. def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, Requires<[FPStackf32]>; def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, Requires<[FPStackf32]>; def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, Requires<[FPStackf64]>; // FP truncations map onto simple pseudo-value conversions if they are to/from // the FP stack. We have validated that only value-preserving truncations make // it through isel. def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, Requires<[FPStackf32]>; def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, Requires<[FPStackf32]>; def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, Requires<[FPStackf64]>; Index: vendor/llvm/dist/lib/Target/X86/X86InstrInfo.td =================================================================== --- vendor/llvm/dist/lib/Target/X86/X86InstrInfo.td (revision 327151) +++ vendor/llvm/dist/lib/Target/X86/X86InstrInfo.td (revision 327152) @@ -1,3345 +1,3348 @@ //===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file describes the X86 instruction set, defining the instructions, and // properties of the instructions which are needed for code generation, machine // code emission, and analysis. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // X86 specific DAG Nodes. // def SDTIntShiftDOp: SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>]>; def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; //def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; // Unary and binary operator instructions that set EFLAGS as a side-effect. def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, [SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>, SDTCisVT<1, i32>]>; // SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>, SDTCisVT<1, i32>, SDTCisVT<4, i32>]>; // RES1, RES2, FLAGS = op LHS, RHS def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTX86BrCond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; def SDTX86SetCC : SDTypeProfile<1, 2, [SDTCisVT<0, i8>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; def SDTX86SetCC_C : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3, [SDTCisVT<0, i64>, SDTCisPtrTy<1>, SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, SDTCisVT<1, iPTR>, SDTCisVT<2, iPTR>]>; def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; def SDTX86Void : SDTypeProfile<0, 0, []>; def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, [SDNPHasChain,SDNPSideEffect]>; def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, [SDNPHasChain]>; def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>; def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, [SDNPHasChain]>; def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>; def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, [SDNPHasChain, SDNPSideEffect]>; def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, [SDNPHasChain, SDNPSideEffect]>; def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG", SDTX86caspairSaveEbx8, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG", SDTX86caspairSaveRbx16, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue]>; def X86vastart_save_xmm_regs : SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", SDT_X86VASTART_SAVE_XMM_REGS, [SDNPHasChain, SDNPVariadic]>; def X86vaarg64 : SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; def X86callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, [SDNPHasChain, SDNPOutGlue]>; def X86callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def X86call : SDNode<"X86ISD::CALL", SDT_X86Call, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>; def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad]>; def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, [SDNPHasChain]>; def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect]>; def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPSideEffect]>; def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect]>; def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>; def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags, [SDNPCommutative]>; def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>; def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>; def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86lock_add : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86lock_sub : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86lock_or : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86lock_xor : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86lock_inc : SDNode<"X86ISD::LINC", SDTLockUnaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA, [SDNPHasChain, SDNPOutGlue]>; def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, [SDNPHasChain]>; def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def X86lwpins : SDNode<"X86ISD::LWPINS", SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>; //===----------------------------------------------------------------------===// // X86 Operand Definitions. // // A version of ptr_rc which excludes SP, ESP, and RSP. This is used for // the index operand of an address, to conform to x86 encoding restrictions. def ptr_rc_nosp : PointerLikeRegClass<1>; // *mem - Operand definitions for the funky X86 addressing mode operands. // def X86MemAsmOperand : AsmOperandClass { let Name = "Mem"; } let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in { def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; } def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; } def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; } def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; } def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; } def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; } def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; } def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; } // Gather mem operands def X86Mem64_RC128Operand : AsmOperandClass { let Name = "Mem64_RC128"; } def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; } def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; } def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; } def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; } def X86Mem64_RC128XOperand : AsmOperandClass { let Name = "Mem64_RC128X"; } def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; } def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; } def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; } def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; } def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; } def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; } def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; } } def X86AbsMemAsmOperand : AsmOperandClass { let Name = "AbsMem"; let SuperClasses = [X86MemAsmOperand]; } class X86MemOperand : Operand { let PrintMethod = printMethod; let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); let ParserMatchClass = parserMatchClass; let OperandType = "OPERAND_MEMORY"; } // Gather mem operands class X86VMemOperand : X86MemOperand { let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG); } def anymem : X86MemOperand<"printanymem">; def opaque32mem : X86MemOperand<"printopaquemem">; def opaque48mem : X86MemOperand<"printopaquemem">; def opaque80mem : X86MemOperand<"printopaquemem">; def opaque512mem : X86MemOperand<"printopaquemem">; def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>; def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>; def v512mem : X86VMemOperand; // Gather mem operands def vx64mem : X86VMemOperand; def vx128mem : X86VMemOperand; def vx256mem : X86VMemOperand; def vy128mem : X86VMemOperand; def vy256mem : X86VMemOperand; def vx64xmem : X86VMemOperand; def vx128xmem : X86VMemOperand; def vx256xmem : X86VMemOperand; def vy128xmem : X86VMemOperand; def vy256xmem : X86VMemOperand; def vy512mem : X86VMemOperand; def vz256xmem : X86VMemOperand; def vz512mem : X86VMemOperand; // A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead // of a plain GPR, so that it doesn't potentially require a REX prefix. def ptr_rc_norex : PointerLikeRegClass<2>; def ptr_rc_norex_nosp : PointerLikeRegClass<3>; def i8mem_NOREX : Operand { let PrintMethod = "printi8mem"; let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, SEGMENT_REG); let ParserMatchClass = X86Mem8AsmOperand; let OperandType = "OPERAND_MEMORY"; } // GPRs available for tailcall. // It represents GR32_TC, GR64_TC or GR64_TCW64. def ptr_rc_tailcall : PointerLikeRegClass<4>; // Special i32mem for addresses of load folding tail calls. These are not // allowed to use callee-saved registers since they must be scheduled // after callee-saved register are popped. def i32mem_TC : Operand { let PrintMethod = "printi32mem"; let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, i32imm, SEGMENT_REG); let ParserMatchClass = X86Mem32AsmOperand; let OperandType = "OPERAND_MEMORY"; } // Special i64mem for addresses of load folding tail calls. These are not // allowed to use callee-saved registers since they must be scheduled // after callee-saved register are popped. def i64mem_TC : Operand { let PrintMethod = "printi64mem"; let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, i32imm, SEGMENT_REG); let ParserMatchClass = X86Mem64AsmOperand; let OperandType = "OPERAND_MEMORY"; } let OperandType = "OPERAND_PCREL", ParserMatchClass = X86AbsMemAsmOperand, PrintMethod = "printPCRelImm" in { def i32imm_pcrel : Operand; def i16imm_pcrel : Operand; // Branch targets have OtherVT type and print as pc-relative values. def brtarget : Operand; def brtarget8 : Operand; } // Special parser to detect 16-bit mode to select 16-bit displacement. def X86AbsMem16AsmOperand : AsmOperandClass { let Name = "AbsMem16"; let RenderMethod = "addAbsMemOperands"; let SuperClasses = [X86AbsMemAsmOperand]; } // Branch targets have OtherVT type and print as pc-relative values. let OperandType = "OPERAND_PCREL", PrintMethod = "printPCRelImm" in { let ParserMatchClass = X86AbsMem16AsmOperand in def brtarget16 : Operand; let ParserMatchClass = X86AbsMemAsmOperand in def brtarget32 : Operand; } let RenderMethod = "addSrcIdxOperands" in { def X86SrcIdx8Operand : AsmOperandClass { let Name = "SrcIdx8"; let SuperClasses = [X86Mem8AsmOperand]; } def X86SrcIdx16Operand : AsmOperandClass { let Name = "SrcIdx16"; let SuperClasses = [X86Mem16AsmOperand]; } def X86SrcIdx32Operand : AsmOperandClass { let Name = "SrcIdx32"; let SuperClasses = [X86Mem32AsmOperand]; } def X86SrcIdx64Operand : AsmOperandClass { let Name = "SrcIdx64"; let SuperClasses = [X86Mem64AsmOperand]; } } // RenderMethod = "addSrcIdxOperands" let RenderMethod = "addDstIdxOperands" in { def X86DstIdx8Operand : AsmOperandClass { let Name = "DstIdx8"; let SuperClasses = [X86Mem8AsmOperand]; } def X86DstIdx16Operand : AsmOperandClass { let Name = "DstIdx16"; let SuperClasses = [X86Mem16AsmOperand]; } def X86DstIdx32Operand : AsmOperandClass { let Name = "DstIdx32"; let SuperClasses = [X86Mem32AsmOperand]; } def X86DstIdx64Operand : AsmOperandClass { let Name = "DstIdx64"; let SuperClasses = [X86Mem64AsmOperand]; } } // RenderMethod = "addDstIdxOperands" let RenderMethod = "addMemOffsOperands" in { def X86MemOffs16_8AsmOperand : AsmOperandClass { let Name = "MemOffs16_8"; let SuperClasses = [X86Mem8AsmOperand]; } def X86MemOffs16_16AsmOperand : AsmOperandClass { let Name = "MemOffs16_16"; let SuperClasses = [X86Mem16AsmOperand]; } def X86MemOffs16_32AsmOperand : AsmOperandClass { let Name = "MemOffs16_32"; let SuperClasses = [X86Mem32AsmOperand]; } def X86MemOffs32_8AsmOperand : AsmOperandClass { let Name = "MemOffs32_8"; let SuperClasses = [X86Mem8AsmOperand]; } def X86MemOffs32_16AsmOperand : AsmOperandClass { let Name = "MemOffs32_16"; let SuperClasses = [X86Mem16AsmOperand]; } def X86MemOffs32_32AsmOperand : AsmOperandClass { let Name = "MemOffs32_32"; let SuperClasses = [X86Mem32AsmOperand]; } def X86MemOffs32_64AsmOperand : AsmOperandClass { let Name = "MemOffs32_64"; let SuperClasses = [X86Mem64AsmOperand]; } def X86MemOffs64_8AsmOperand : AsmOperandClass { let Name = "MemOffs64_8"; let SuperClasses = [X86Mem8AsmOperand]; } def X86MemOffs64_16AsmOperand : AsmOperandClass { let Name = "MemOffs64_16"; let SuperClasses = [X86Mem16AsmOperand]; } def X86MemOffs64_32AsmOperand : AsmOperandClass { let Name = "MemOffs64_32"; let SuperClasses = [X86Mem32AsmOperand]; } def X86MemOffs64_64AsmOperand : AsmOperandClass { let Name = "MemOffs64_64"; let SuperClasses = [X86Mem64AsmOperand]; } } // RenderMethod = "addMemOffsOperands" class X86SrcIdxOperand : X86MemOperand { let MIOperandInfo = (ops ptr_rc, SEGMENT_REG); } class X86DstIdxOperand : X86MemOperand { let MIOperandInfo = (ops ptr_rc); } def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>; def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>; def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>; def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>; def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>; def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>; def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>; def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>; class X86MemOffsOperand : X86MemOperand { let MIOperandInfo = (ops immOperand, SEGMENT_REG); } def offset16_8 : X86MemOffsOperand; def offset16_16 : X86MemOffsOperand; def offset16_32 : X86MemOffsOperand; def offset32_8 : X86MemOffsOperand; def offset32_16 : X86MemOffsOperand; def offset32_32 : X86MemOffsOperand; def offset32_64 : X86MemOffsOperand; def offset64_8 : X86MemOffsOperand; def offset64_16 : X86MemOffsOperand; def offset64_32 : X86MemOffsOperand; def offset64_64 : X86MemOffsOperand; def SSECC : Operand { let PrintMethod = "printSSEAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } def AVXCC : Operand { let PrintMethod = "printSSEAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } def AVX512ICC : Operand { let PrintMethod = "printSSEAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } def XOPCC : Operand { let PrintMethod = "printXOPCC"; let OperandType = "OPERAND_IMMEDIATE"; } class ImmSExtAsmOperandClass : AsmOperandClass { let SuperClasses = [ImmAsmOperand]; let RenderMethod = "addImmOperands"; } def X86GR32orGR64AsmOperand : AsmOperandClass { let Name = "GR32orGR64"; } def GR32orGR64 : RegisterOperand { let ParserMatchClass = X86GR32orGR64AsmOperand; } def AVX512RCOperand : AsmOperandClass { let Name = "AVX512RC"; } def AVX512RC : Operand { let PrintMethod = "printRoundingControl"; let OperandType = "OPERAND_IMMEDIATE"; let ParserMatchClass = AVX512RCOperand; } // Sign-extended immediate classes. We don't need to define the full lattice // here because there is no instruction with an ambiguity between ImmSExti64i32 // and ImmSExti32i8. // // The strange ranges come from the fact that the assembler always works with // 64-bit immediates, but for a 16-bit target value we want to accept both "-1" // (which will be a -1ULL), and "0xFF" (-1 in 16-bits). // [0, 0x7FFFFFFF] | // [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass { let Name = "ImmSExti64i32"; } // [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass { let Name = "ImmSExti16i8"; let SuperClasses = [ImmSExti64i32AsmOperand]; } // [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { let Name = "ImmSExti32i8"; } // [0, 0x0000007F] | // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { let Name = "ImmSExti64i8"; let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, ImmSExti64i32AsmOperand]; } // Unsigned immediate used by SSE/AVX instructions // [0, 0xFF] // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] def ImmUnsignedi8AsmOperand : AsmOperandClass { let Name = "ImmUnsignedi8"; let RenderMethod = "addImmOperands"; } // A couple of more descriptive operand definitions. // 16-bits but only 8 bits are significant. def i16i8imm : Operand { let ParserMatchClass = ImmSExti16i8AsmOperand; let OperandType = "OPERAND_IMMEDIATE"; } // 32-bits but only 8 bits are significant. def i32i8imm : Operand { let ParserMatchClass = ImmSExti32i8AsmOperand; let OperandType = "OPERAND_IMMEDIATE"; } // 64-bits but only 32 bits are significant. def i64i32imm : Operand { let ParserMatchClass = ImmSExti64i32AsmOperand; let OperandType = "OPERAND_IMMEDIATE"; } // 64-bits but only 8 bits are significant. def i64i8imm : Operand { let ParserMatchClass = ImmSExti64i8AsmOperand; let OperandType = "OPERAND_IMMEDIATE"; } // Unsigned 8-bit immediate used by SSE/AVX instructions. def u8imm : Operand { let PrintMethod = "printU8Imm"; let ParserMatchClass = ImmUnsignedi8AsmOperand; let OperandType = "OPERAND_IMMEDIATE"; } // 32-bit immediate but only 8-bits are significant and they are unsigned. // Used by some SSE/AVX instructions that use intrinsics. def i32u8imm : Operand { let PrintMethod = "printU8Imm"; let ParserMatchClass = ImmUnsignedi8AsmOperand; let OperandType = "OPERAND_IMMEDIATE"; } // 64-bits but only 32 bits are significant, and those bits are treated as being // pc relative. def i64i32imm_pcrel : Operand { let PrintMethod = "printPCRelImm"; let ParserMatchClass = X86AbsMemAsmOperand; let OperandType = "OPERAND_PCREL"; } def lea64_32mem : Operand { let PrintMethod = "printanymem"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); let ParserMatchClass = X86MemAsmOperand; } // Memory operands that use 64-bit pointers in both ILP32 and LP64. def lea64mem : Operand { let PrintMethod = "printanymem"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); let ParserMatchClass = X86MemAsmOperand; } //===----------------------------------------------------------------------===// // X86 Complex Pattern Definitions. // // Define X86-specific addressing mode. def addr : ComplexPattern; def lea32addr : ComplexPattern; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. def lea64_32addr : ComplexPattern; def tls32addr : ComplexPattern; def tls32baseaddr : ComplexPattern; def lea64addr : ComplexPattern; def tls64addr : ComplexPattern; def tls64baseaddr : ComplexPattern; def vectoraddr : ComplexPattern; // A relocatable immediate is either an immediate operand or an operand that can // be relocated by the linker to an immediate, such as a regular symbol in // non-PIC code. def relocImm : ComplexPattern; //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. def TruePredicate : Predicate<"true">; def HasCMov : Predicate<"Subtarget->hasCMov()">; def NoCMov : Predicate<"!Subtarget->hasCMov()">; def HasMMX : Predicate<"Subtarget->hasMMX()">; def Has3DNow : Predicate<"Subtarget->has3DNow()">; def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">; def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">; def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def NoAVX : Predicate<"!Subtarget->hasAVX()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; def HasAVX512 : Predicate<"Subtarget->hasAVX512()">, AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">; def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">, AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">, AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">; def HasPFI : Predicate<"Subtarget->hasPFI()">, AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; def HasERI : Predicate<"Subtarget->hasERI()">, AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">; def HasDQI : Predicate<"Subtarget->hasDQI()">, AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">; def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">, AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; def NoBWI : Predicate<"!Subtarget->hasBWI()">; def HasVLX : Predicate<"Subtarget->hasVLX()">, AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; def PKU : Predicate<"Subtarget->hasPKU()">; -def HasVNNI : Predicate<"Subtarget->hasVNNI()">; +def HasVNNI : Predicate<"Subtarget->hasVNNI()">, + AssemblerPredicate<"FeatureVNNI", "AVX-512 VNNI ISA">; -def HasBITALG : Predicate<"Subtarget->hasBITALG()">; +def HasBITALG : Predicate<"Subtarget->hasBITALG()">, + AssemblerPredicate<"FeatureBITALG", "AVX-512 BITALG ISA">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; def HasVAES : Predicate<"Subtarget->hasVAES()">; def NoVLX_Or_NoVAES : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVAES()">; def HasFXSR : Predicate<"Subtarget->hasFXSR()">; def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def NoVLX_Or_NoVPCLMULQDQ : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">; def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">; def HasGFNI : Predicate<"Subtarget->hasGFNI()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">; def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasTBM : Predicate<"Subtarget->hasTBM()">; def NoTBM : Predicate<"!Subtarget->hasTBM()">; def HasLWP : Predicate<"Subtarget->hasLWP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; def HasF16C : Predicate<"Subtarget->hasF16C()">; def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">; def HasVBMI : Predicate<"Subtarget->hasVBMI()">, AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">; -def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">; +def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">, + AssemblerPredicate<"FeatureVBMI2", "AVX-512 VBMI2 ISA">; def HasIFMA : Predicate<"Subtarget->hasIFMA()">, AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">; def HasRTM : Predicate<"Subtarget->hasRTM()">; def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">; def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">; def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">; def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">; def HasIBT : Predicate<"Subtarget->hasIBT()">; def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; def HasCLWB : Predicate<"Subtarget->hasCLWB()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, AssemblerPredicate<"Mode64Bit", "64-bit mode">; def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">; def In16BitMode : Predicate<"Subtarget->is16Bit()">, AssemblerPredicate<"Mode16Bit", "16-bit mode">; def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">; def In32BitMode : Predicate<"Subtarget->is32Bit()">, AssemblerPredicate<"Mode32Bit", "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" "Subtarget->getFrameLowering()->hasFP(*MF)"> { let RecomputePerFunction = 1; } def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" "TM.getCodeModel() == CodeModel::Kernel">; def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; // We could compute these on a per-module basis but doing so requires accessing // the Function object through the Subtarget and objections were raised // to that (see post-commit review comments for r301750). let RecomputePerFunction = 1 in { def OptForSize : Predicate<"MF->getFunction().optForSize()">; def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">; def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">; def UseIncDec : Predicate<"!Subtarget->slowIncDec() || " "MF->getFunction().optForSize()">; } def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; def FavorMemIndirectCall : Predicate<"!Subtarget->slowTwoMemOps()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. // include "X86InstrFormats.td" //===----------------------------------------------------------------------===// // Pattern fragments. // // X86 specific condition code. These correspond to CondCode in // X86InstrInfo.h. They must be kept in synch. def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ def X86_COND_NO : PatLeaf<(i8 10)>; def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO def X86_COND_NS : PatLeaf<(i8 12)>; def X86_COND_O : PatLeaf<(i8 13)>; def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE def X86_COND_S : PatLeaf<(i8 15)>; def i16immSExt8 : ImmLeaf(Imm); }]>; def i32immSExt8 : ImmLeaf(Imm); }]>; def i64immSExt8 : ImmLeaf(Imm); }]>; def i64immSExt32 : ImmLeaf(Imm); }]>; // FIXME: Ideally we would just replace the above i*immSExt* matchers with // relocImm-based matchers, but then FastISel would be unable to use them. def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{ return isSExtRelocImm<8>(N); }]>; def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{ return isSExtRelocImm<32>(N); }]>; // If we have multiple users of an immediate, it's much smaller to reuse // the register, rather than encode the immediate in every instruction. // This has the risk of increasing register pressure from stretched live // ranges, however, the immediates should be trivial to rematerialize by // the RA in the event of high register pressure. // TODO : This is currently enabled for stores and binary ops. There are more // cases for which this can be enabled, though this catches the bulk of the // issues. // TODO2 : This should really also be enabled under O2, but there's currently // an issue with RA where we don't pull the constants into their users // when we rematerialize them. I'll follow-up on enabling O2 after we fix that // issue. // TODO3 : This is currently limited to single basic blocks (DAG creation // pulls block immediates to the top and merges them if necessary). // Eventually, it would be nice to allow ConstantHoisting to merge constants // globally for potentially added savings. // def imm8_su : PatLeaf<(i8 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; def imm16_su : PatLeaf<(i16 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; def imm32_su : PatLeaf<(i32 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; def i64immSExt32_su : PatLeaf<(i64immSExt32), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; def i64immSExt8_su : PatLeaf<(i64immSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit // unsigned field. def i64immZExt32 : ImmLeaf(Imm); }]>; def i64immZExt32SExt8 : ImmLeaf(Imm) && isInt<8>(static_cast(Imm)); }]>; // Helper fragments for loads. // It's always safe to treat a anyext i16 load as a i32 load if the i16 is // known to be 32-bit aligned or better. Ditto for i8 to i16. def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ LoadSDNode *LD = cast(N); ISD::LoadExtType ExtType = LD->getExtensionType(); if (ExtType == ISD::NON_EXTLOAD) return true; if (ExtType == ISD::EXTLOAD) return LD->getAlignment() >= 2 && !LD->isVolatile(); return false; }]>; def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{ LoadSDNode *LD = cast(N); ISD::LoadExtType ExtType = LD->getExtensionType(); if (ExtType == ISD::EXTLOAD) return LD->getAlignment() >= 2 && !LD->isVolatile(); return false; }]>; def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ LoadSDNode *LD = cast(N); ISD::LoadExtType ExtType = LD->getExtensionType(); if (ExtType == ISD::NON_EXTLOAD) return true; if (ExtType == ISD::EXTLOAD) return LD->getAlignment() >= 4 && !LD->isVolatile(); return false; }]>; def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>; def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>; def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>; def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>; def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>; def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>; def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>; def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>; def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>; def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>; def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>; def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>; def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>; def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>; def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>; def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>; def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>; def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>; def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>; def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>; def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>; def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; // An 'and' node with a single use. def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ return N->hasOneUse(); }]>; // An 'srl' node with a single use. def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{ return N->hasOneUse(); }]>; // An 'trunc' node with a single use. def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ return N->hasOneUse(); }]>; //===----------------------------------------------------------------------===// // Instruction list. // // Nop let hasSideEffects = 0, SchedRW = [WriteZero] in { def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero), "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero), "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32; def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero), "nop{q}\t$zero", [], IIC_NOP>, TB, Requires<[In64BitMode]>; // Also allow register so we can assemble/disassemble def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero), "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero), "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32; def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero), "nop{q}\t$zero", [], IIC_NOP>, TB, Requires<[In64BitMode]>; } // Constructing a stack frame. def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>; let SchedRW = [WriteALU] in { let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", [], IIC_LEAVE>, Requires<[Not64BitMode]>; let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", [], IIC_LEAVE>, Requires<[In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// // Miscellaneous Instructions. // let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1, SchedRW = [WriteSystem] in def Int_eh_sjlj_setup_dispatch : PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>; let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in { let mayLoad = 1, SchedRW = [WriteLoad] in { def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG16>, OpSize16; def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG>, OpSize16; def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; } // mayLoad, SchedRW let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in { def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], IIC_POP_MEM>, OpSize16; def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>; } // mayStore, mayLoad, WriteRMW let mayStore = 1, SchedRW = [WriteStore] in { def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[], IIC_PUSH_REG>, OpSize16; def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], IIC_PUSH_REG>, OpSize16; def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; } // mayStore, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], IIC_PUSH_MEM>, OpSize16; def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; } // mayLoad, mayStore, SchedRW } let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, SchedRW = [WriteRMW], Defs = [ESP] in { let Uses = [ESP] in def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins), [(set GR32:$dst, (int_x86_flags_read_u32))]>, Requires<[Not64BitMode]>; let Uses = [RSP] in def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins), [(set GR64:$dst, (int_x86_flags_read_u64))]>, Requires<[In64BitMode]>; } let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, SchedRW = [WriteRMW] in { let Defs = [ESP, EFLAGS], Uses = [ESP] in def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), [(int_x86_flags_write_u32 GR32:$src)]>, Requires<[Not64BitMode]>; let Defs = [RSP, EFLAGS], Uses = [RSP] in def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), [(int_x86_flags_write_u64 GR64:$src)]>, Requires<[In64BitMode]>; } let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, SchedRW = [WriteLoad] in { def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize16; def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, OpSize32, Requires<[Not64BitMode]>; } let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0, SchedRW = [WriteStore] in { def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, OpSize16; def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, OpSize32, Requires<[Not64BitMode]>; } let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in { let mayLoad = 1, SchedRW = [WriteLoad] in { def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; } // mayLoad, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [], IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>; let mayStore = 1, SchedRW = [WriteStore] in { def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; } // mayStore, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>; } // mayLoad, mayStore, SchedRW } let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[In64BitMode]>; def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[In64BitMode]>; } let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in { def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>, OpSize32, Requires<[Not64BitMode]>; def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>, OpSize16, Requires<[Not64BitMode]>; } let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>, OpSize32, Requires<[Not64BitMode]>; def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>, OpSize16, Requires<[Not64BitMode]>; } let Constraints = "$src = $dst", SchedRW = [WriteALU] in { // GR32 = bswap GR32 def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), "bswap{l}\t$dst", [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB; def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), "bswap{q}\t$dst", [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB; } // Constraints = "$src = $dst", SchedRW // Bit scan instructions. let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))], IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))], IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))], IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>; def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))], IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))], IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))], IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))], IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))], IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>; } // Defs = [EFLAGS] let SchedRW = [WriteMicrocoded] in { // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16; def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src), "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32; def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; } // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst), "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>; let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst), "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16; let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst), "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst), "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst), "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>; let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst), "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16; let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst), "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32; let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst), "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>; // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in { def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>; def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16; def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src), "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32; def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>; } } // SchedRW //===----------------------------------------------------------------------===// // Move Instructions. // let SchedRW = [WriteMove] in { let hasSideEffects = 0 in { def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, imm:$src)], IIC_MOV>; def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16; def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, relocImm:$src)], IIC_MOV>, OpSize32; def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>; } let isReMaterializable = 1 in { def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), "movabs{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, relocImm:$src)], IIC_MOV>; } // Longer forms that use a ModR/M byte. Needed for disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, FoldGenData<"MOV8ri">; def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16, FoldGenData<"MOV16ri">; def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32, FoldGenData<"MOV32ri">; } } // SchedRW let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>, Requires<[In64BitMode]>; } // SchedRW let hasSideEffects = 0 in { /// Memory offset versions of moves. The immediate is an address mode sized /// offset from the segment base. let SchedRW = [WriteALU] in { let mayLoad = 1 in { let Defs = [AL] in def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src), "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize32; let Defs = [AX] in def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src), "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize16, AdSize32; let Defs = [EAX] in def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src), "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, OpSize32, AdSize32; let Defs = [RAX] in def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src), "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>, AdSize32; let Defs = [AL] in def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src), "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16; let Defs = [AX] in def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src), "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize16, AdSize16; let Defs = [EAX] in def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src), "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, AdSize16, OpSize32; } let mayStore = 1 in { let Uses = [AL] in def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst), "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32; let Uses = [AX] in def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst), "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize16, AdSize32; let Uses = [EAX] in def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst), "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, OpSize32, AdSize32; let Uses = [RAX] in def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst), "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>, AdSize32; let Uses = [AL] in def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst), "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16; let Uses = [AX] in def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst), "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize16, AdSize16; let Uses = [EAX] in def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst), "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, OpSize32, AdSize16; } } // These forms all have full 64-bit absolute addresses in their instructions // and use the movabs mnemonic to indicate this specific form. let mayLoad = 1 in { let Defs = [AL] in def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), "movabs{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize64; let Defs = [AX] in def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src), "movabs{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize16, AdSize64; let Defs = [EAX] in def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src), "movabs{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, OpSize32, AdSize64; let Defs = [RAX] in def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), "movabs{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>, AdSize64; } let mayStore = 1 in { let Uses = [AL] in def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst), "movabs{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize64; let Uses = [AX] in def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst), "movabs{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize16, AdSize64; let Uses = [EAX] in def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst), "movabs{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, OpSize32, AdSize64; let Uses = [RAX] in def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst), "movabs{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>, AdSize64; } } // hasSideEffects = 0 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, FoldGenData<"MOV8rr">; def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16, FoldGenData<"MOV16rr">; def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32, FoldGenData<"MOV32rr">; def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, FoldGenData<"MOV64rr">; } let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>; def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "mov{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16; def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "mov{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32; def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>; } let SchedRW = [WriteStore] in { def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), "mov{b}\t{$src, $dst|$dst, $src}", [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>; def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "mov{w}\t{$src, $dst|$dst, $src}", [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16; def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32; def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>; } // SchedRW // Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so // that they can be used for copying and storing h registers, which can't be // encoded when a REX prefix is present. let isCodeGenOnly = 1 in { let hasSideEffects = 0 in def MOV8rr_NOREX : I<0x88, MRMDestReg, (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>, Sched<[WriteMove]>; let mayStore = 1, hasSideEffects = 0 in def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV_MEM>, Sched<[WriteStore]>; let mayLoad = 1, hasSideEffects = 0, canFoldAsLoad = 1, isReMaterializable = 1 in def MOV8rm_NOREX : I<0x8A, MRMSrcMem, (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV_MEM>, Sched<[WriteLoad]>; } // Condition code ops, incl. set if equal/not equal/... let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", [(set EFLAGS, (X86sahf AH))], IIC_AHF>, Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], IIC_AHF>, // AH = flags Requires<[HasLAHFSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// // Bit tests instructions: BT, BTS, BTR, BTC. let Defs = [EFLAGS] in { let SchedRW = [WriteALU] in { def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>, OpSize16, TB, NotMemoryFoldable; def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, OpSize32, TB, NotMemoryFoldable; def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB, NotMemoryFoldable; } // SchedRW // Unlike with the register+register form, the memory+register form of the // bt instruction does not ignore the high bits of the index. From ISel's // perspective, this is pretty bizarre. Make these instructions disassembly // only for now. These instructions are also slow on modern CPUs so that's // another reason to avoid generating them. let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BT_MR >, OpSize16, TB, NotMemoryFoldable; def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BT_MR >, OpSize32, TB, NotMemoryFoldable; def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BT_MR >, TB, NotMemoryFoldable; } let SchedRW = [WriteALU] in { def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))], IIC_BT_RI>, OpSize16, TB; def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))], IIC_BT_RI>, OpSize32, TB; def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))], IIC_BT_RI>, TB; } // SchedRW // Note that these instructions aren't slow because that only applies when the // other operand is in a register. When it's an immediate, bt is still fast. let SchedRW = [WriteALU] in { def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2)) ], IIC_BT_MI>, OpSize16, TB; def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2)) ], IIC_BT_MI>, OpSize32, TB; def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi64 addr:$src1), i64immSExt8:$src2))], IIC_BT_MI>, TB, Requires<[In64BitMode]>; } // SchedRW let hasSideEffects = 0 in { let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize16, TB, NotMemoryFoldable; def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize32, TB, NotMemoryFoldable; def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB, NotMemoryFoldable; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize16, TB, NotMemoryFoldable; def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize32, TB, NotMemoryFoldable; def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB, NotMemoryFoldable; } let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize16, TB; def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize32, TB; def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize16, TB; def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize32, TB; def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB, Requires<[In64BitMode]>; } let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize16, TB, NotMemoryFoldable; def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize32, TB, NotMemoryFoldable; def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, NotMemoryFoldable; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize16, TB, NotMemoryFoldable; def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize32, TB, NotMemoryFoldable; def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB, NotMemoryFoldable; } let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize16, TB; def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize32, TB; def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize16, TB; def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize32, TB; def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB, Requires<[In64BitMode]>; } let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize16, TB, NotMemoryFoldable; def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, OpSize32, TB, NotMemoryFoldable; def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB, NotMemoryFoldable; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize16, TB, NotMemoryFoldable; def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, OpSize32, TB, NotMemoryFoldable; def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB, NotMemoryFoldable; } let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize16, TB; def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize32, TB; def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize16, TB; def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize32, TB; def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB, Requires<[In64BitMode]>; } } // hasSideEffects = 0 } // Defs = [EFLAGS] //===----------------------------------------------------------------------===// // Atomic support // // Atomic swap. These are just normal xchg instructions. But since a memory // operand is referenced, the atomicity is ensured. multiclass ATOMIC_SWAP opc8, bits<8> opc, string mnemonic, string frag, InstrItinClass itin> { let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in { def NAME#8rm : I(frag # "_8") addr:$ptr, GR8:$val))], itin>; def NAME#16rm : I(frag # "_16") addr:$ptr, GR16:$val))], itin>, OpSize16; def NAME#32rm : I(frag # "_32") addr:$ptr, GR32:$val))], itin>, OpSize32; def NAME#64rm : RI(frag # "_64") addr:$ptr, GR64:$val))], itin>; } } defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>; // Swap between registers. let SchedRW = [WriteALU] in { let Constraints = "$val = $dst" in { def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src), "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize16; def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src), "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize32; def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; } // Swap between EAX and other registers. let Uses = [AX], Defs = [AX] in def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16; let Uses = [EAX], Defs = [EAX] in def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, OpSize32, Requires<[Not64BitMode]>; let Uses = [EAX], Defs = [EAX] in // Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. // xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, OpSize32, Requires<[In64BitMode]>; let Uses = [RAX], Defs = [RAX] in def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>; } // SchedRW let SchedRW = [WriteALU] in { def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, OpSize16; def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, OpSize32; def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, OpSize16; def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, OpSize32; def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; } let SchedRW = [WriteALU] in { def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_REG8>, TB; def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_REG>, TB, OpSize16; def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_REG>, TB, OpSize32; def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_REG>, TB; } // SchedRW let SchedRW = [WriteALULd, WriteRMW] in { let mayLoad = 1, mayStore = 1 in { def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_MEM8>, TB; def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_MEM>, TB, OpSize16; def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_MEM>, TB, OpSize32; def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], IIC_CMPXCHG_MEM>, TB; } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB; let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>, TB, Requires<[HasCmpxchg16b, In64BitMode]>; } // SchedRW // Lock instruction prefix let SchedRW = [WriteMicrocoded] in def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; let SchedRW = [WriteNop] in { // Rex64 instruction prefix def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", [], IIC_NOP>, Requires<[In64BitMode]>; // Data16 instruction prefix def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", [], IIC_NOP>, Requires<[Not16BitMode]>; // Data instruction prefix def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", [], IIC_NOP>, Requires<[In16BitMode]>; } // SchedRW // Repeat string operation instruction prefixes // These use the DF flag in the EFLAGS register to inc or dec ECX let Defs = [ECX], Uses = [ECX,EFLAGS], SchedRW = [WriteMicrocoded] in { // Repeat (used with INS, OUTS, MOVS, LODS and STOS) def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; // Repeat while not equal (used with CMPS and SCAS) def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; } // String manipulation instructions let SchedRW = [WriteMicrocoded] in { // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src), "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>; let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src), "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16; let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src), "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32; let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src), "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>; } let SchedRW = [WriteSystem] in { // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in { def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src), "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>; def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src), "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16; def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src), "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32; } // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in { def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst), "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>; def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst), "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16; def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst), "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32; } } // Flag instructions let SchedRW = [WriteALU] in { def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>; def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>; def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>; def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>; def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>; def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>; def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; } // Table lookup instructions let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, Sched<[WriteLoad]>; let SchedRW = [WriteMicrocoded] in { // ASCII Adjust After Addition let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, Requires<[Not64BitMode]>; // ASCII Adjust AX Before Division let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>; // ASCII Adjust AX After Multiply let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>; // ASCII Adjust AL After Subtraction - sets let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>, Requires<[Not64BitMode]>; // Decimal Adjust AL after Addition let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, Requires<[Not64BitMode]>; // Decimal Adjust AL after Subtraction let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, Requires<[Not64BitMode]>; } // SchedRW let SchedRW = [WriteSystem] in { // Check Array Index Against Bounds def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16, Requires<[Not64BitMode]>; def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32, Requires<[Not64BitMode]>; // Adjust RPL Field of Segment Selector def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>, Requires<[Not64BitMode]>; let mayStore = 1 in def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>, Requires<[Not64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// // MOVBE Instructions // let Predicates = [HasMOVBE] in { let SchedRW = [WriteALULd] in { def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "movbe{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>, OpSize16, T8PS; def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "movbe{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>, OpSize32, T8PS; def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movbe{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>, T8PS; } let SchedRW = [WriteStore] in { def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "movbe{w}\t{$src, $dst|$dst, $src}", [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>, OpSize16, T8PS; def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movbe{l}\t{$src, $dst|$dst, $src}", [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>, OpSize32, T8PS; def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movbe{q}\t{$src, $dst|$dst, $src}", [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>, T8PS; } } //===----------------------------------------------------------------------===// // RDRAND Instruction // let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in { def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), "rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, OpSize16, PS; def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), "rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, OpSize32, PS; def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), "rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, PS; } //===----------------------------------------------------------------------===// // RDSEED Instruction // let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in { def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, OpSize16, PS; def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, OpSize32, PS; def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, PS; } //===----------------------------------------------------------------------===// // LZCNT Instruction // let Predicates = [HasLZCNT], Defs = [EFLAGS] in { def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "lzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)], IIC_LZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>; def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctlz (loadi16 addr:$src))), (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize16, Sched<[WriteIMulLd]>; def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)], IIC_LZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>; def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "lzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctlz (loadi32 addr:$src))), (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize32, Sched<[WriteIMulLd]>; def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "lzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)], IIC_LZCNT_RR>, XS, Sched<[WriteIMul]>; def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "lzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctlz (loadi64 addr:$src))), (implicit EFLAGS)], IIC_LZCNT_RM>, XS, Sched<[WriteIMulLd]>; } //===----------------------------------------------------------------------===// // BMI Instructions // let Predicates = [HasBMI], Defs = [EFLAGS] in { def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "tzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)], IIC_TZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>; def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "tzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (cttz (loadi16 addr:$src))), (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize16, Sched<[WriteIMulLd]>; def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "tzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)], IIC_TZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>; def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "tzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (cttz (loadi32 addr:$src))), (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize32, Sched<[WriteIMulLd]>; def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "tzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)], IIC_TZCNT_RR>, XS, Sched<[WriteIMul]>; def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "tzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (cttz (loadi64 addr:$src))), (implicit EFLAGS)], IIC_TZCNT_RM>, XS, Sched<[WriteIMulLd]>; } multiclass bmi_bls { let hasSideEffects = 0 in { def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), [], IIC_UNARY_REG>, T8PS, VEX_4V, Sched<[WriteALU]>; let mayLoad = 1 in def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), [], IIC_UNARY_MEM>, T8PS, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>; } } let Predicates = [HasBMI], Defs = [EFLAGS] in { defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>; defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W; defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>; defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W; defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>; defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W; } //===----------------------------------------------------------------------===// // Pattern fragments to auto generate BMI instructions. //===----------------------------------------------------------------------===// let Predicates = [HasBMI] in { // FIXME: patterns for the load versions are not implemented def : Pat<(and GR32:$src, (add GR32:$src, -1)), (BLSR32rr GR32:$src)>; def : Pat<(and GR64:$src, (add GR64:$src, -1)), (BLSR64rr GR64:$src)>; def : Pat<(xor GR32:$src, (add GR32:$src, -1)), (BLSMSK32rr GR32:$src)>; def : Pat<(xor GR64:$src, (add GR64:$src, -1)), (BLSMSK64rr GR64:$src)>; def : Pat<(and GR32:$src, (ineg GR32:$src)), (BLSI32rr GR32:$src)>; def : Pat<(and GR64:$src, (ineg GR64:$src)), (BLSI64rr GR64:$src)>; } multiclass bmi_bextr_bzhi opc, string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, PatFrag ld_frag> { def rr : I, T8PS, VEX, Sched<[WriteALU]>; def rm : I, T8PS, VEX, Sched<[WriteALULd, ReadAfterLd]>; } let Predicates = [HasBMI], Defs = [EFLAGS] in { defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem, int_x86_bmi_bextr_32, loadi32>; defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem, int_x86_bmi_bextr_64, loadi64>, VEX_W; } let Predicates = [HasBMI2], Defs = [EFLAGS] in { defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem, int_x86_bmi_bzhi_32, loadi32>; defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem, int_x86_bmi_bzhi_64, loadi64>, VEX_W; } def CountTrailingOnes : SDNodeXFormgetZExtValue()), SDLoc(N)); }]>; def BEXTRMaskXForm : SDNodeXFormgetZExtValue()); return getI32Imm(Length << 8, SDLoc(N)); }]>; def AndMask64 : ImmLeaf UINT32_MAX; }]>; // Use BEXTR for 64-bit 'and' with large immediate 'mask'. let Predicates = [HasBMI, NoBMI2, NoTBM] in { def : Pat<(and GR64:$src, AndMask64:$mask), (BEXTR64rr GR64:$src, (SUBREG_TO_REG (i64 0), (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>; def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), (BEXTR64rm addr:$src, (SUBREG_TO_REG (i64 0), (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>; } // Use BZHI for 64-bit 'and' with large immediate 'mask'. let Predicates = [HasBMI2, NoTBM] in { def : Pat<(and GR64:$src, AndMask64:$mask), (BZHI64rr GR64:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), (BZHI64rm addr:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; } let Predicates = [HasBMI2] in { def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)), (BZHI32rr GR32:$src, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)), (BZHI32rm addr:$src, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)), (BZHI64rr GR64:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), (BZHI64rm addr:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; // x & (-1 >> (32 - y)) def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), (BZHI32rr GR32:$src, GR32:$lz)>; def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), (BZHI32rm addr:$src, GR32:$lz)>; // x & (-1 >> (64 - y)) def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), (BZHI64rr GR64:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), (BZHI64rm addr:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; // x << (32 - y) >> (32 - y) def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))), (i8 (trunc (sub 32, GR32:$lz)))), (BZHI32rr GR32:$src, GR32:$lz)>; def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))), (i8 (trunc (sub 32, GR32:$lz)))), (BZHI32rm addr:$src, GR32:$lz)>; // x << (64 - y) >> (64 - y) def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))), (i8 (trunc (sub 64, GR32:$lz)))), (BZHI64rr GR64:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))), (i8 (trunc (sub 64, GR32:$lz)))), (BZHI64rm addr:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; } // HasBMI2 multiclass bmi_pdep_pext { def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (Int RC:$src1, RC:$src2))], IIC_BIN_NONMEM>, VEX_4V, Sched<[WriteALU]>; def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], IIC_BIN_MEM>, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>; } let Predicates = [HasBMI2] in { defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, int_x86_bmi_pdep_32, loadi32>, T8XD; defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W; defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, int_x86_bmi_pext_32, loadi32>, T8XS; defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W; } //===----------------------------------------------------------------------===// // TBM Instructions // let Predicates = [HasTBM], Defs = [EFLAGS] in { multiclass tbm_ternary_imm_intr opc, RegisterClass RC, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, Intrinsic Int, Operand immtype, SDPatternOperator immoperator> { def ri : Ii32, XOP, XOPA, Sched<[WriteALU]>; def mi : Ii32, XOP, XOPA, Sched<[WriteALULd, ReadAfterLd]>; } defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32, int_x86_tbm_bextri_u32, i32imm, imm>; let ImmT = Imm32S in defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64, int_x86_tbm_bextri_u64, i64i32imm, i64immSExt32>, VEX_W; multiclass tbm_binary_rm opc, Format FormReg, Format FormMem, RegisterClass RC, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag> { let hasSideEffects = 0 in { def rr : I, XOP_4V, XOP9, Sched<[WriteALU]>; let mayLoad = 1 in def rm : I, XOP_4V, XOP9, Sched<[WriteALULd, ReadAfterLd]>; } } multiclass tbm_binary_intr opc, string OpcodeStr, Format FormReg, Format FormMem> { defm NAME#32 : tbm_binary_rm; defm NAME#64 : tbm_binary_rm, VEX_W; } defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>; defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>; defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>; defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>; defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>; defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>; defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>; defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>; defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>; } // HasTBM, EFLAGS // Use BEXTRI for 64-bit 'and' with large immediate 'mask'. let Predicates = [HasTBM] in { def : Pat<(and GR64:$src, AndMask64:$mask), (BEXTRI64ri GR64:$src, (BEXTRMaskXForm imm:$mask))>; def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), (BEXTRI64mi addr:$src, (BEXTRMaskXForm imm:$mask))>; } //===----------------------------------------------------------------------===// // Lightweight Profiling Instructions let Predicates = [HasLWP], SchedRW = [WriteSystem] in { def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src", [(int_x86_llwpcb GR32:$src)], IIC_LWP>, XOP, XOP9; def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst", [(set GR32:$dst, (int_x86_slwpcb))], IIC_LWP>, XOP, XOP9; def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src", [(int_x86_llwpcb GR64:$src)], IIC_LWP>, XOP, XOP9, VEX_W; def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst", [(set GR64:$dst, (int_x86_slwpcb))], IIC_LWP>, XOP, XOP9, VEX_W; multiclass lwpins_intr { def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))], IIC_LWP>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))], IIC_LWP>, XOP_4V, XOPA; } let Defs = [EFLAGS] in { defm LWPINS32 : lwpins_intr; defm LWPINS64 : lwpins_intr, VEX_W; } // EFLAGS multiclass lwpval_intr { def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", [(Int RC:$src0, GR32:$src1, imm:$cntl)], IIC_LWP>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)], IIC_LWP>, XOP_4V, XOPA; } defm LWPVAL32 : lwpval_intr; defm LWPVAL64 : lwpval_intr, VEX_W; } // HasLWP, SchedRW //===----------------------------------------------------------------------===// // MONITORX/MWAITX Instructions // let SchedRW = [ WriteSystem ] in { let usesCustomInserter = 1 in { def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>, Requires<[ HasMWAITX ]>; } let Uses = [ EAX, ECX, EDX ] in { def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], IIC_SSE_MONITORX>, TB, Requires<[ HasMWAITX ]>; } let Uses = [ ECX, EAX, EBX ] in { def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>, TB, Requires<[ HasMWAITX ]>; } } // SchedRW def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>, Requires<[ Not64BitMode ]>; def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>, Requires<[ In64BitMode ]>; def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, Requires<[ Not64BitMode ]>; def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, Requires<[ In64BitMode ]>; //===----------------------------------------------------------------------===// // CLZERO Instruction // let SchedRW = [WriteSystem] in { let Uses = [EAX] in def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", [], IIC_SSE_CLZERO>, TB, Requires<[HasCLZERO]>; let usesCustomInserter = 1 in { def CLZERO : PseudoI<(outs), (ins i32mem:$src1), [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>; } } // SchedRW def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>; def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Pattern fragments to auto generate TBM instructions. //===----------------------------------------------------------------------===// let Predicates = [HasTBM] in { // FIXME: patterns for the load versions are not implemented def : Pat<(and GR32:$src, (add GR32:$src, 1)), (BLCFILL32rr GR32:$src)>; def : Pat<(and GR64:$src, (add GR64:$src, 1)), (BLCFILL64rr GR64:$src)>; def : Pat<(or GR32:$src, (not (add GR32:$src, 1))), (BLCI32rr GR32:$src)>; def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), (BLCI64rr GR64:$src)>; // Extra patterns because opt can optimize the above patterns to this. def : Pat<(or GR32:$src, (sub -2, GR32:$src)), (BLCI32rr GR32:$src)>; def : Pat<(or GR64:$src, (sub -2, GR64:$src)), (BLCI64rr GR64:$src)>; def : Pat<(and (not GR32:$src), (add GR32:$src, 1)), (BLCIC32rr GR32:$src)>; def : Pat<(and (not GR64:$src), (add GR64:$src, 1)), (BLCIC64rr GR64:$src)>; def : Pat<(xor GR32:$src, (add GR32:$src, 1)), (BLCMSK32rr GR32:$src)>; def : Pat<(xor GR64:$src, (add GR64:$src, 1)), (BLCMSK64rr GR64:$src)>; def : Pat<(or GR32:$src, (add GR32:$src, 1)), (BLCS32rr GR32:$src)>; def : Pat<(or GR64:$src, (add GR64:$src, 1)), (BLCS64rr GR64:$src)>; def : Pat<(or GR32:$src, (add GR32:$src, -1)), (BLSFILL32rr GR32:$src)>; def : Pat<(or GR64:$src, (add GR64:$src, -1)), (BLSFILL64rr GR64:$src)>; def : Pat<(or (not GR32:$src), (add GR32:$src, -1)), (BLSIC32rr GR32:$src)>; def : Pat<(or (not GR64:$src), (add GR64:$src, -1)), (BLSIC64rr GR64:$src)>; def : Pat<(or (not GR32:$src), (add GR32:$src, 1)), (T1MSKC32rr GR32:$src)>; def : Pat<(or (not GR64:$src), (add GR64:$src, 1)), (T1MSKC64rr GR64:$src)>; def : Pat<(and (not GR32:$src), (add GR32:$src, -1)), (TZMSK32rr GR32:$src)>; def : Pat<(and (not GR64:$src), (add GR64:$src, -1)), (TZMSK64rr GR64:$src)>; } // HasTBM //===----------------------------------------------------------------------===// // Memory Instructions // let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflushopt\t$src", [(int_x86_clflushopt addr:$src)], IIC_SSE_PREFETCH>, PD; let Predicates = [HasCLWB], SchedRW = [WriteLoad] in def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", [(int_x86_clwb addr:$src)], IIC_SSE_PREFETCH>, PD; //===----------------------------------------------------------------------===// // Subsystems. //===----------------------------------------------------------------------===// include "X86InstrArithmetic.td" include "X86InstrCMovSetCC.td" include "X86InstrExtension.td" include "X86InstrControl.td" include "X86InstrShiftRotate.td" // X87 Floating Point Stack. include "X86InstrFPStack.td" // SIMD support (SSE, MMX and AVX) include "X86InstrFragmentsSIMD.td" // FMA - Fused Multiply-Add support (requires FMA) include "X86InstrFMA.td" // XOP include "X86InstrXOP.td" // SSE, MMX and 3DNow! vector support. include "X86InstrSSE.td" include "X86InstrAVX512.td" include "X86InstrMMX.td" include "X86Instr3DNow.td" // MPX instructions include "X86InstrMPX.td" include "X86InstrVMX.td" include "X86InstrSVM.td" include "X86InstrTSX.td" include "X86InstrSGX.td" // System instructions. include "X86InstrSystem.td" // Compiler Pseudo Instructions and Pat Patterns include "X86InstrCompiler.td" include "X86InstrVecCompiler.td" //===----------------------------------------------------------------------===// // Assembler Mnemonic Aliases //===----------------------------------------------------------------------===// def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"cbw", "cbtw", "att">; def : MnemonicAlias<"cwde", "cwtl", "att">; def : MnemonicAlias<"cwd", "cwtd", "att">; def : MnemonicAlias<"cdq", "cltd", "att">; def : MnemonicAlias<"cdqe", "cltq", "att">; def : MnemonicAlias<"cqo", "cqto", "att">; // In 64-bit mode lret maps to lretl; it is not ambiguous with lretq. def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"loopz", "loope">; def : MnemonicAlias<"loopnz", "loopne">; def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"popf", "popfq", "intel">, Requires<[In64BitMode]>; def : MnemonicAlias<"popfd", "popfl", "att">; // FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in // all modes. However: "push (addr)" and "push $42" should default to // pushl/pushq depending on the current mode. Similar for "pop %bx" def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"pushf", "pushfq", "intel">, Requires<[In64BitMode]>; def : MnemonicAlias<"pushfd", "pushfl", "att">; def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>; def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>; def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>; def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>; def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>; def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>; def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"repe", "rep">; def : MnemonicAlias<"repz", "rep">; def : MnemonicAlias<"repnz", "repne">; def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; // Apply 'ret' behavior to 'retn' def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"retn", "ret", "intel">; def : MnemonicAlias<"sal", "shl", "intel">; def : MnemonicAlias<"salb", "shlb", "att">; def : MnemonicAlias<"salw", "shlw", "att">; def : MnemonicAlias<"sall", "shll", "att">; def : MnemonicAlias<"salq", "shlq", "att">; def : MnemonicAlias<"smovb", "movsb", "att">; def : MnemonicAlias<"smovw", "movsw", "att">; def : MnemonicAlias<"smovl", "movsl", "att">; def : MnemonicAlias<"smovq", "movsq", "att">; def : MnemonicAlias<"ud2a", "ud2", "att">; def : MnemonicAlias<"verrw", "verr", "att">; // MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release' def : MnemonicAlias<"acquire", "xacquire", "intel">; def : MnemonicAlias<"release", "xrelease", "intel">; // System instruction aliases. def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>; def : MnemonicAlias<"sysret", "sysretl", "att">; def : MnemonicAlias<"sysexit", "sysexitl", "att">; def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>; // Floating point stack aliases. def : MnemonicAlias<"fcmovz", "fcmove", "att">; def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; def : MnemonicAlias<"fcomip", "fcompi">; def : MnemonicAlias<"fildq", "fildll", "att">; def : MnemonicAlias<"fistpq", "fistpll", "att">; def : MnemonicAlias<"fisttpq", "fisttpll", "att">; def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; def : MnemonicAlias<"fucomip", "fucompi">; def : MnemonicAlias<"fwait", "wait">; def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; def : MnemonicAlias<"xsaveq", "xsave64", "att">; def : MnemonicAlias<"xrstorq", "xrstor64", "att">; def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; def : MnemonicAlias<"xsavecq", "xsavec64", "att">; def : MnemonicAlias<"xsavesq", "xsaves64", "att">; class CondCodeAlias : MnemonicAlias; /// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of /// MnemonicAlias's that canonicalize the condition code in a mnemonic, for /// example "setz" -> "sete". multiclass IntegerCondCodeMnemonicAlias { def C : CondCodeAlias; // setc -> setb def Z : CondCodeAlias; // setz -> sete def NA : CondCodeAlias; // setna -> setbe def NB : CondCodeAlias; // setnb -> setae def NC : CondCodeAlias; // setnc -> setae def NG : CondCodeAlias; // setng -> setle def NL : CondCodeAlias; // setnl -> setge def NZ : CondCodeAlias; // setnz -> setne def PE : CondCodeAlias; // setpe -> setp def PO : CondCodeAlias; // setpo -> setnp def NAE : CondCodeAlias; // setnae -> setb def NBE : CondCodeAlias; // setnbe -> seta def NGE : CondCodeAlias; // setnge -> setl def NLE : CondCodeAlias; // setnle -> setg } // Aliases for set defm : IntegerCondCodeMnemonicAlias<"set", "">; // Aliases for j defm : IntegerCondCodeMnemonicAlias<"j", "">; // Aliases for cmov{w,l,q} defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">; defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">; defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">; // No size suffix for intel-style asm. defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; //===----------------------------------------------------------------------===// // Assembler Instruction Aliases //===----------------------------------------------------------------------===// // aad/aam default to base 10 if no operand is specified. def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; // clr aliases. def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; // lods aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>; def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>; def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>; def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0>; def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0>; def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0>; def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; // stos aliases. Accept the source being omitted because it's implicit in // the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the source. def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>; def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>; def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>; def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0>; def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0>; def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0>; def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // scas aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>; def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>; def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>; def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0>; def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0>; def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0>; def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // cmps aliases. Mnemonic suffix being omitted because it's implicit // in the destination. def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0>; def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0>; def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0>; def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>; // movs aliases. Mnemonic suffix being omitted because it's implicit // in the destination. def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0>; def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0>; def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0>; def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>; // div and idiv aliases for explicit A register. def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>; def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>; def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>; def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>; def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>; def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>; def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>; def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>; def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>; def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>; def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>; def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>; def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>; def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>; def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on on ST1. // For example, "fxch" -> "fxch %st(1)" def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; def : InstAlias<"fxch", (XCH_F ST1), 0>; def : InstAlias<"fcom", (COM_FST0r ST1), 0>; def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; def : InstAlias<"fcomi", (COM_FIr ST1), 0>; def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; // Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas. multiclass FpUnaryAlias { def : InstAlias; def : InstAlias; } defm : FpUnaryAlias<"fadd", ADD_FST0r>; defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; defm : FpUnaryAlias<"fsub", SUB_FST0r>; defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; defm : FpUnaryAlias<"fmul", MUL_FST0r>; defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; defm : FpUnaryAlias<"fdiv", DIV_FST0r>; defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; defm : FpUnaryAlias<"fcompi", COM_FIPr>; defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; // We accept "fnstsw %eax" even though it only writes %ax. def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>; def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>; def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does. def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>; def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>; def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul , B" is an alias for "imul , B, B". def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // ins aliases. Accept the mnemonic suffix being omitted because it's implicit // in the destination. def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0>; def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0>; def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0>; // outs aliases. Accept the mnemonic suffix being omitted because it's implicit // in the source. def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0>; def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0>; def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>; def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>; def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. All segment/mem forms are equivalent, this has the shortest // encoding. def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV16sm SEGMENT_REG:$seg, i16mem:$mem), 0>; def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV16ms i16mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq , ' as an alias for movabsq. def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsx aliases def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>; def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. // outb %dx -> outb %al, %dx def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; // 'sldt ' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity // errors, since its encoding is the most compact. def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>; // shld/shrd op,op -> shld op, op, CL def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>; def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>; def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>; def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>; def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>; def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>; def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>; def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>; def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>; def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>; def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>; /* FIXME: This is disabled because the asm matcher is currently incapable of * matching a fixed immediate like $1. // "shl X, $1" is an alias for "shl X". multiclass ShiftRotateByOneAlias { def : InstAlias(!strconcat(Opc, "8r1")) GR8:$op)>; def : InstAlias(!strconcat(Opc, "16r1")) GR16:$op)>; def : InstAlias(!strconcat(Opc, "32r1")) GR32:$op)>; def : InstAlias(!strconcat(Opc, "64r1")) GR64:$op)>; def : InstAlias(!strconcat(Opc, "8m1")) i8mem:$op)>; def : InstAlias(!strconcat(Opc, "16m1")) i16mem:$op)>; def : InstAlias(!strconcat(Opc, "32m1")) i32mem:$op)>; def : InstAlias(!strconcat(Opc, "64m1")) i64mem:$op)>; } defm : ShiftRotateByOneAlias<"rcl", "RCL">; defm : ShiftRotateByOneAlias<"rcr", "RCR">; defm : ShiftRotateByOneAlias<"rol", "ROL">; defm : ShiftRotateByOneAlias<"ror", "ROR">; FIXME */ // test: We accept "testX , " and "testX , " as synonyms. def : InstAlias<"test{b}\t{$mem, $val|$val, $mem}", (TEST8mr i8mem :$mem, GR8 :$val), 0>; def : InstAlias<"test{w}\t{$mem, $val|$val, $mem}", (TEST16mr i16mem:$mem, GR16:$val), 0>; def : InstAlias<"test{l}\t{$mem, $val|$val, $mem}", (TEST32mr i32mem:$mem, GR32:$val), 0>; def : InstAlias<"test{q}\t{$mem, $val|$val, $mem}", (TEST64mr i64mem:$mem, GR64:$val), 0>; // xchg: We accept "xchgX , " and "xchgX , " as synonyms. def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem), 0>; def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem), 0>; def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem), 0>; def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem), 0>; // xchg: We accept "xchgX , %eax" and "xchgX %eax, " as synonyms. def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>; def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>; def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; // These aliases exist to get the parser to prioritize matching 8-bit // immediate encodings over matching the implicit ax/eax/rax encodings. By // explicitly mentioning the A register here, these entries will be ordered // first due to the more explicit immediate type. def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; Index: vendor/llvm/dist/lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- vendor/llvm/dist/lib/Transforms/Utils/LoopUtils.cpp (revision 327151) +++ vendor/llvm/dist/lib/Transforms/Utils/LoopUtils.cpp (revision 327152) @@ -1,1649 +1,1650 @@ //===-- LoopUtils.cpp - Loop Utility functions -------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines common loop utility functions. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-utils" bool RecurrenceDescriptor::areAllUsesIn(Instruction *I, SmallPtrSetImpl &Set) { for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) if (!Set.count(dyn_cast(*Use))) return false; return true; } bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurrenceKind Kind) { switch (Kind) { default: break; case RK_IntegerAdd: case RK_IntegerMult: case RK_IntegerOr: case RK_IntegerAnd: case RK_IntegerXor: case RK_IntegerMinMax: return true; } return false; } bool RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind Kind) { return (Kind != RK_NoRecurrence) && !isIntegerRecurrenceKind(Kind); } bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) { switch (Kind) { default: break; case RK_IntegerAdd: case RK_IntegerMult: case RK_FloatAdd: case RK_FloatMult: return true; } return false; } Instruction * RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, SmallPtrSetImpl &Visited, SmallPtrSetImpl &CI) { if (!Phi->hasOneUse()) return Phi; const APInt *M = nullptr; Instruction *I, *J = cast(Phi->use_begin()->getUser()); // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT // with a new integer type of the corresponding bit width. if (match(J, m_c_And(m_Instruction(I), m_APInt(M)))) { int32_t Bits = (*M + 1).exactLogBase2(); if (Bits > 0) { RT = IntegerType::get(Phi->getContext(), Bits); Visited.insert(Phi); CI.insert(J); return J; } } return Phi; } bool RecurrenceDescriptor::getSourceExtensionKind( Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned, SmallPtrSetImpl &Visited, SmallPtrSetImpl &CI) { SmallVector Worklist; bool FoundOneOperand = false; unsigned DstSize = RT->getPrimitiveSizeInBits(); Worklist.push_back(Exit); // Traverse the instructions in the reduction expression, beginning with the // exit value. while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (Use &U : I->operands()) { // Terminate the traversal if the operand is not an instruction, or we // reach the starting value. Instruction *J = dyn_cast(U.get()); if (!J || J == Start) continue; // Otherwise, investigate the operation if it is also in the expression. if (Visited.count(J)) { Worklist.push_back(J); continue; } // If the operand is not in Visited, it is not a reduction operation, but // it does feed into one. Make sure it is either a single-use sign- or // zero-extend instruction. CastInst *Cast = dyn_cast(J); bool IsSExtInst = isa(J); if (!Cast || !Cast->hasOneUse() || !(isa(J) || IsSExtInst)) return false; // Ensure the source type of the extend is no larger than the reduction // type. It is not necessary for the types to be identical. unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); if (SrcSize > DstSize) return false; // Furthermore, ensure that all such extends are of the same kind. if (FoundOneOperand) { if (IsSigned != IsSExtInst) return false; } else { FoundOneOperand = true; IsSigned = IsSExtInst; } // Lastly, if the source type of the extend matches the reduction type, // add the extend to CI so that we can avoid accounting for it in the // cost model. if (SrcSize == DstSize) CI.insert(Cast); } } return true; } bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, RecurrenceDescriptor &RedDes) { if (Phi->getNumIncomingValues() != 2) return false; // Reduction variables are only found in the loop header block. if (Phi->getParent() != TheLoop->getHeader()) return false; // Obtain the reduction start value from the value that comes from the loop // preheader. Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); // ExitInstruction is the single value which is used outside the loop. // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = nullptr; // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; // We start with the PHI node and scan for all of the users of this // instruction. All users must be instructions that can be used as reduction // variables (such as ADD). We must have a single out-of-block user. The cycle // must include the original PHI. bool FoundStartPHI = false; // To recognize min/max patterns formed by a icmp select sequence, we store // the number of instruction we saw from the recognized min/max pattern, // to make sure we only see exactly the two instructions. unsigned NumCmpSelectPatternInst = 0; InstDesc ReduxDesc(false, nullptr); // Data used for determining if the recurrence has been type-promoted. Type *RecurrenceType = Phi->getType(); SmallPtrSet CastInsts; Instruction *Start = Phi; bool IsSigned = false; SmallPtrSet VisitedInsts; SmallVector Worklist; // Return early if the recurrence kind does not match the type of Phi. If the // recurrence kind is arithmetic, we attempt to look through AND operations // resulting from the type promotion performed by InstCombine. Vector // operations are not limited to the legal integer widths, so we may be able // to evaluate the reduction in the narrower width. if (RecurrenceType->isFloatingPointTy()) { if (!isFloatingPointRecurrenceKind(Kind)) return false; } else { if (!isIntegerRecurrenceKind(Kind)) return false; if (isArithmeticRecurrenceKind(Kind)) Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts); } Worklist.push_back(Start); VisitedInsts.insert(Start); // A value in the reduction can be used: // - By the reduction: // - Reduction operation: // - One use of reduction value (safe). // - Multiple use of reduction value (not safe). // - PHI: // - All uses of the PHI must be the reduction (safe). // - Otherwise, not safe. // - By instructions outside of the loop (safe). // * One value may have several outside users, but all outside // uses must be of the same value. // - By an instruction that is not part of the reduction (not safe). // This is either: // * An instruction type other than PHI or the reduction operation. // * A PHI in the header other than the initial PHI. while (!Worklist.empty()) { Instruction *Cur = Worklist.back(); Worklist.pop_back(); // No Users. // If the instruction has no users then this is a broken chain and can't be // a reduction variable. if (Cur->use_empty()) return false; bool IsAPhi = isa(Cur); // A header PHI use other than the original PHI. if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) return false; // Reductions of instructions such as Div, and Sub is only possible if the // LHS is the reduction variable. if (!Cur->isCommutative() && !IsAPhi && !isa(Cur) && !isa(Cur) && !isa(Cur) && !VisitedInsts.count(dyn_cast(Cur->getOperand(0)))) return false; // Any reduction instruction must be of one of the allowed kinds. We ignore // the starting value (the Phi or an AND instruction if the Phi has been // type-promoted). if (Cur != Start) { ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); if (!ReduxDesc.isRecurrence()) return false; } // A reduction operation must only have one use of the reduction value. if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && hasMultipleUsesOf(Cur, VisitedInsts)) return false; // All inputs to a PHI node must be a reduction value. if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) return false; if (Kind == RK_IntegerMinMax && (isa(Cur) || isa(Cur))) ++NumCmpSelectPatternInst; if (Kind == RK_FloatMinMax && (isa(Cur) || isa(Cur))) ++NumCmpSelectPatternInst; // Check whether we found a reduction operator. FoundReduxOp |= !IsAPhi && Cur != Start; // Process users of current instruction. Push non-PHI nodes after PHI nodes // onto the stack. This way we are going to have seen all inputs to PHI // nodes once we get to them. SmallVector NonPHIs; SmallVector PHIs; for (User *U : Cur->users()) { Instruction *UI = cast(U); // Check if we found the exit user. BasicBlock *Parent = UI->getParent(); if (!TheLoop->contains(Parent)) { // If we already know this instruction is used externally, move on to // the next user. if (ExitInstruction == Cur) continue; // Exit if you find multiple values used outside or if the header phi // node is being used. In this case the user uses the value of the // previous iteration, in which case we would loose "VF-1" iterations of // the reduction operation if we vectorize. if (ExitInstruction != nullptr || Cur == Phi) return false; // The instruction used by an outside user must be the last instruction // before we feed back to the reduction phi. Otherwise, we loose VF-1 // operations on the value. if (!is_contained(Phi->operands(), Cur)) return false; ExitInstruction = Cur; continue; } // Process instructions only once (termination). Each reduction cycle // value must only be used once, except by phi nodes and min/max // reductions which are represented as a cmp followed by a select. InstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI).second) { if (isa(UI)) PHIs.push_back(UI); else NonPHIs.push_back(UI); } else if (!isa(UI) && ((!isa(UI) && !isa(UI) && !isa(UI)) || !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence())) return false; // Remember that we completed the cycle. if (UI == Phi) FoundStartPHI = true; } Worklist.append(PHIs.begin(), PHIs.end()); Worklist.append(NonPHIs.begin(), NonPHIs.end()); } // This means we have seen one but not the other instruction of the // pattern or more than just a select and cmp. if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && NumCmpSelectPatternInst != 2) return false; if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; // If we think Phi may have been type-promoted, we also need to ensure that // all source operands of the reduction are either SExtInsts or ZEstInsts. If // so, we will be able to evaluate the reduction in the narrower bit width. if (Start != Phi) if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType, IsSigned, VisitedInsts, CastInsts)) return false; // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. // The ExitInstruction(Instruction which is allowed to have out-of-loop users) // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. RecurrenceDescriptor RD( RdxStart, ExitInstruction, Kind, ReduxDesc.getMinMaxKind(), ReduxDesc.getUnsafeAlgebraInst(), RecurrenceType, IsSigned, CastInsts); RedDes = RD; return true; } /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction /// pattern corresponding to a min(X, Y) or max(X, Y). RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev) { assert((isa(I) || isa(I) || isa(I)) && "Expect a select instruction"); Instruction *Cmp = nullptr; SelectInst *Select = nullptr; // We must handle the select(cmp()) as a single instruction. Advance to the // select. if ((Cmp = dyn_cast(I)) || (Cmp = dyn_cast(I))) { if (!Cmp->hasOneUse() || !(Select = dyn_cast(*I->user_begin()))) return InstDesc(false, I); return InstDesc(Select, Prev.getMinMaxKind()); } // Only handle single use cases for now. if (!(Select = dyn_cast(I))) return InstDesc(false, I); if (!(Cmp = dyn_cast(I->getOperand(0))) && !(Cmp = dyn_cast(I->getOperand(0)))) return InstDesc(false, I); if (!Cmp->hasOneUse()) return InstDesc(false, I); Value *CmpLeft; Value *CmpRight; // Look for a min/max pattern. if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return InstDesc(Select, MRK_UIntMin); else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return InstDesc(Select, MRK_UIntMax); else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return InstDesc(Select, MRK_SIntMax); else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return InstDesc(Select, MRK_SIntMin); else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return InstDesc(Select, MRK_FloatMin); else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return InstDesc(Select, MRK_FloatMax); else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return InstDesc(Select, MRK_FloatMin); else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return InstDesc(Select, MRK_FloatMax); return InstDesc(false, I); } RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, InstDesc &Prev, bool HasFunNoNaNAttr) { bool FP = I->getType()->isFloatingPointTy(); Instruction *UAI = Prev.getUnsafeAlgebraInst(); if (!UAI && FP && !I->isFast()) UAI = I; // Found an unsafe (unvectorizable) algebra instruction. switch (I->getOpcode()) { default: return InstDesc(false, I); case Instruction::PHI: return InstDesc(I, Prev.getMinMaxKind(), Prev.getUnsafeAlgebraInst()); case Instruction::Sub: case Instruction::Add: return InstDesc(Kind == RK_IntegerAdd, I); case Instruction::Mul: return InstDesc(Kind == RK_IntegerMult, I); case Instruction::And: return InstDesc(Kind == RK_IntegerAnd, I); case Instruction::Or: return InstDesc(Kind == RK_IntegerOr, I); case Instruction::Xor: return InstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: return InstDesc(Kind == RK_FloatMult, I, UAI); case Instruction::FSub: case Instruction::FAdd: return InstDesc(Kind == RK_FloatAdd, I, UAI); case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: if (Kind != RK_IntegerMinMax && (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) return InstDesc(false, I); return isMinMaxSelectCmpPattern(I, Prev); } } bool RecurrenceDescriptor::hasMultipleUsesOf( Instruction *I, SmallPtrSetImpl &Insts) { unsigned NumUses = 0; for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { if (Insts.count(dyn_cast(*Use))) ++NumUses; if (NumUses > 1) return true; } return false; } bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes) { BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); bool HasFunNoNaNAttr = F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes)) { DEBUG(dbgs() << "Found an float MINMAX reduction PHI." << *Phi << "\n"); return true; } // Not a reduction of known type. return false; } bool RecurrenceDescriptor::isFirstOrderRecurrence( PHINode *Phi, Loop *TheLoop, DenseMap &SinkAfter, DominatorTree *DT) { // Ensure the phi node is in the loop header and has two incoming values. if (Phi->getParent() != TheLoop->getHeader() || Phi->getNumIncomingValues() != 2) return false; // Ensure the loop has a preheader and a single latch block. The loop // vectorizer will need the latch to set up the next iteration of the loop. auto *Preheader = TheLoop->getLoopPreheader(); auto *Latch = TheLoop->getLoopLatch(); if (!Preheader || !Latch) return false; // Ensure the phi node's incoming blocks are the loop preheader and latch. if (Phi->getBasicBlockIndex(Preheader) < 0 || Phi->getBasicBlockIndex(Latch) < 0) return false; // Get the previous value. The previous value comes from the latch edge while // the initial value comes form the preheader edge. auto *Previous = dyn_cast(Phi->getIncomingValueForBlock(Latch)); if (!Previous || !TheLoop->contains(Previous) || isa(Previous) || SinkAfter.count(Previous)) // Cannot rely on dominance due to motion. return false; // Ensure every user of the phi node is dominated by the previous value. // The dominance requirement ensures the loop vectorizer will not need to // vectorize the initial value prior to the first iteration of the loop. // TODO: Consider extending this sinking to handle other kinds of instructions // and expressions, beyond sinking a single cast past Previous. if (Phi->hasOneUse()) { auto *I = Phi->user_back(); if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() && DT->dominates(Previous, I->user_back())) { if (!DT->dominates(Previous, I)) // Otherwise we're good w/o sinking. SinkAfter[I] = Previous; return true; } } for (User *U : Phi->users()) if (auto *I = dyn_cast(U)) { if (!DT->dominates(Previous, I)) return false; } return true; } /// This function returns the identity element (or neutral element) for /// the operation K. Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K, Type *Tp) { switch (K) { case RK_IntegerXor: case RK_IntegerAdd: case RK_IntegerOr: // Adding, Xoring, Oring zero to a number does not change it. return ConstantInt::get(Tp, 0); case RK_IntegerMult: // Multiplying a number by 1 does not change it. return ConstantInt::get(Tp, 1); case RK_IntegerAnd: // AND-ing a number with an all-1 value does not change it. return ConstantInt::get(Tp, -1, true); case RK_FloatMult: // Multiplying a number by 1 does not change it. return ConstantFP::get(Tp, 1.0L); case RK_FloatAdd: // Adding zero to a number does not change it. return ConstantFP::get(Tp, 0.0L); default: llvm_unreachable("Unknown recurrence kind"); } } /// This function translates the recurrence kind to an LLVM binary operator. unsigned RecurrenceDescriptor::getRecurrenceBinOp(RecurrenceKind Kind) { switch (Kind) { case RK_IntegerAdd: return Instruction::Add; case RK_IntegerMult: return Instruction::Mul; case RK_IntegerOr: return Instruction::Or; case RK_IntegerAnd: return Instruction::And; case RK_IntegerXor: return Instruction::Xor; case RK_FloatMult: return Instruction::FMul; case RK_FloatAdd: return Instruction::FAdd; case RK_IntegerMinMax: return Instruction::ICmp; case RK_FloatMinMax: return Instruction::FCmp; default: llvm_unreachable("Unknown recurrence operation"); } } Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder, MinMaxRecurrenceKind RK, Value *Left, Value *Right) { CmpInst::Predicate P = CmpInst::ICMP_NE; switch (RK) { default: llvm_unreachable("Unknown min/max recurrence kind"); case MRK_UIntMin: P = CmpInst::ICMP_ULT; break; case MRK_UIntMax: P = CmpInst::ICMP_UGT; break; case MRK_SIntMin: P = CmpInst::ICMP_SLT; break; case MRK_SIntMax: P = CmpInst::ICMP_SGT; break; case MRK_FloatMin: P = CmpInst::FCMP_OLT; break; case MRK_FloatMax: P = CmpInst::FCMP_OGT; break; } // We only match FP sequences that are 'fast', so we can unconditionally // set it on any generated instructions. IRBuilder<>::FastMathFlagGuard FMFG(Builder); FastMathFlags FMF; FMF.setFast(); Builder.setFastMathFlags(FMF); Value *Cmp; if (RK == MRK_FloatMin || RK == MRK_FloatMax) Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); else Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); return Select; } InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step, BinaryOperator *BOp, SmallVectorImpl *Casts) : StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp) { assert(IK != IK_NoInduction && "Not an induction"); // Start value type should match the induction kind and the value // itself should not be null. assert(StartValue && "StartValue is null"); assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && "StartValue is not a pointer for pointer induction"); assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && "StartValue is not an integer for integer induction"); // Check the Step Value. It should be non-zero integer value. assert((!getConstIntStepValue() || !getConstIntStepValue()->isZero()) && "Step value is zero"); assert((IK != IK_PtrInduction || getConstIntStepValue()) && "Step value should be constant for pointer induction"); assert((IK == IK_FpInduction || Step->getType()->isIntegerTy()) && "StepValue is not an integer"); assert((IK != IK_FpInduction || Step->getType()->isFloatingPointTy()) && "StepValue is not FP for FpInduction"); assert((IK != IK_FpInduction || (InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub))) && "Binary opcode should be specified for FP induction"); if (Casts) { for (auto &Inst : *Casts) { RedundantCasts.push_back(Inst); } } } int InductionDescriptor::getConsecutiveDirection() const { ConstantInt *ConstStep = getConstIntStepValue(); if (ConstStep && (ConstStep->isOne() || ConstStep->isMinusOne())) return ConstStep->getSExtValue(); return 0; } ConstantInt *InductionDescriptor::getConstIntStepValue() const { if (isa(Step)) return dyn_cast(cast(Step)->getValue()); return nullptr; } Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout& DL) const { SCEVExpander Exp(*SE, DL, "induction"); assert(Index->getType() == Step->getType() && "Index type does not match StepValue type"); switch (IK) { case IK_IntInduction: { assert(Index->getType() == StartValue->getType() && "Index type does not match StartValue type"); // FIXME: Theoretically, we can call getAddExpr() of ScalarEvolution // and calculate (Start + Index * Step) for all cases, without // special handling for "isOne" and "isMinusOne". // But in the real life the result code getting worse. We mix SCEV // expressions and ADD/SUB operations and receive redundant // intermediate values being calculated in different ways and // Instcombine is unable to reduce them all. if (getConstIntStepValue() && getConstIntStepValue()->isMinusOne()) return B.CreateSub(StartValue, Index); if (getConstIntStepValue() && getConstIntStepValue()->isOne()) return B.CreateAdd(StartValue, Index); const SCEV *S = SE->getAddExpr(SE->getSCEV(StartValue), SE->getMulExpr(Step, SE->getSCEV(Index))); return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint()); } case IK_PtrInduction: { assert(isa(Step) && "Expected constant step for pointer induction"); const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step); Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint()); return B.CreateGEP(nullptr, StartValue, Index); } case IK_FpInduction: { assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); assert(InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && "Original bin op should be defined for FP induction"); Value *StepValue = cast(Step)->getValue(); // Floating point operations had to be 'fast' to enable the induction. FastMathFlags Flags; Flags.setFast(); Value *MulExp = B.CreateFMul(StepValue, Index); if (isa(MulExp)) // We have to check, the MulExp may be a constant. cast(MulExp)->setFastMathFlags(Flags); Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode() , StartValue, MulExp, "induction"); if (isa(BOp)) cast(BOp)->setFastMathFlags(Flags); return BOp; } case IK_NoInduction: return nullptr; } llvm_unreachable("invalid enum"); } bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop, ScalarEvolution *SE, InductionDescriptor &D) { // Here we only handle FP induction variables. assert(Phi->getType()->isFloatingPointTy() && "Unexpected Phi type"); if (TheLoop->getHeader() != Phi->getParent()) return false; // The loop may have multiple entrances or multiple exits; we can analyze // this phi if it has a unique entry value and a unique backedge value. if (Phi->getNumIncomingValues() != 2) return false; Value *BEValue = nullptr, *StartValue = nullptr; if (TheLoop->contains(Phi->getIncomingBlock(0))) { BEValue = Phi->getIncomingValue(0); StartValue = Phi->getIncomingValue(1); } else { assert(TheLoop->contains(Phi->getIncomingBlock(1)) && "Unexpected Phi node in the loop"); BEValue = Phi->getIncomingValue(1); StartValue = Phi->getIncomingValue(0); } BinaryOperator *BOp = dyn_cast(BEValue); if (!BOp) return false; Value *Addend = nullptr; if (BOp->getOpcode() == Instruction::FAdd) { if (BOp->getOperand(0) == Phi) Addend = BOp->getOperand(1); else if (BOp->getOperand(1) == Phi) Addend = BOp->getOperand(0); } else if (BOp->getOpcode() == Instruction::FSub) if (BOp->getOperand(0) == Phi) Addend = BOp->getOperand(1); if (!Addend) return false; // The addend should be loop invariant if (auto *I = dyn_cast(Addend)) if (TheLoop->contains(I)) return false; // FP Step has unknown SCEV const SCEV *Step = SE->getUnknown(Addend); D = InductionDescriptor(StartValue, IK_FpInduction, Step, BOp); return true; } /// This function is called when we suspect that the update-chain of a phi node /// (whose symbolic SCEV expression sin \p PhiScev) contains redundant casts, /// that can be ignored. (This can happen when the PSCEV rewriter adds a runtime /// predicate P under which the SCEV expression for the phi can be the /// AddRecurrence \p AR; See createAddRecFromPHIWithCast). We want to find the /// cast instructions that are involved in the update-chain of this induction. /// A caller that adds the required runtime predicate can be free to drop these /// cast instructions, and compute the phi using \p AR (instead of some scev /// expression with casts). /// /// For example, without a predicate the scev expression can take the following /// form: /// (Ext ix (Trunc iy ( Start + i*Step ) to ix) to iy) /// /// It corresponds to the following IR sequence: /// %for.body: /// %x = phi i64 [ 0, %ph ], [ %add, %for.body ] /// %casted_phi = "ExtTrunc i64 %x" /// %add = add i64 %casted_phi, %step /// /// where %x is given in \p PN, /// PSE.getSCEV(%x) is equal to PSE.getSCEV(%casted_phi) under a predicate, /// and the IR sequence that "ExtTrunc i64 %x" represents can take one of /// several forms, for example, such as: /// ExtTrunc1: %casted_phi = and %x, 2^n-1 /// or: /// ExtTrunc2: %t = shl %x, m /// %casted_phi = ashr %t, m /// /// If we are able to find such sequence, we return the instructions /// we found, namely %casted_phi and the instructions on its use-def chain up /// to the phi (not including the phi). -bool getCastsForInductionPHI( - PredicatedScalarEvolution &PSE, const SCEVUnknown *PhiScev, - const SCEVAddRecExpr *AR, SmallVectorImpl &CastInsts) { +static bool getCastsForInductionPHI(PredicatedScalarEvolution &PSE, + const SCEVUnknown *PhiScev, + const SCEVAddRecExpr *AR, + SmallVectorImpl &CastInsts) { assert(CastInsts.empty() && "CastInsts is expected to be empty."); auto *PN = cast(PhiScev->getValue()); assert(PSE.getSCEV(PN) == AR && "Unexpected phi node SCEV expression"); const Loop *L = AR->getLoop(); // Find any cast instructions that participate in the def-use chain of // PhiScev in the loop. // FORNOW/TODO: We currently expect the def-use chain to include only // two-operand instructions, where one of the operands is an invariant. // createAddRecFromPHIWithCasts() currently does not support anything more // involved than that, so we keep the search simple. This can be // extended/generalized as needed. auto getDef = [&](const Value *Val) -> Value * { const BinaryOperator *BinOp = dyn_cast(Val); if (!BinOp) return nullptr; Value *Op0 = BinOp->getOperand(0); Value *Op1 = BinOp->getOperand(1); Value *Def = nullptr; if (L->isLoopInvariant(Op0)) Def = Op1; else if (L->isLoopInvariant(Op1)) Def = Op0; return Def; }; // Look for the instruction that defines the induction via the // loop backedge. BasicBlock *Latch = L->getLoopLatch(); if (!Latch) return false; Value *Val = PN->getIncomingValueForBlock(Latch); if (!Val) return false; // Follow the def-use chain until the induction phi is reached. // If on the way we encounter a Value that has the same SCEV Expr as the // phi node, we can consider the instructions we visit from that point // as part of the cast-sequence that can be ignored. bool InCastSequence = false; auto *Inst = dyn_cast(Val); while (Val != PN) { // If we encountered a phi node other than PN, or if we left the loop, // we bail out. if (!Inst || !L->contains(Inst)) { return false; } auto *AddRec = dyn_cast(PSE.getSCEV(Val)); if (AddRec && PSE.areAddRecsEqualWithPreds(AddRec, AR)) InCastSequence = true; if (InCastSequence) { // Only the last instruction in the cast sequence is expected to have // uses outside the induction def-use chain. if (!CastInsts.empty()) if (!Inst->hasOneUse()) return false; CastInsts.push_back(Inst); } Val = getDef(Val); if (!Val) return false; Inst = dyn_cast(Val); } return InCastSequence; } bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop, PredicatedScalarEvolution &PSE, InductionDescriptor &D, bool Assume) { Type *PhiTy = Phi->getType(); // Handle integer and pointer inductions variables. // Now we handle also FP induction but not trying to make a // recurrent expression from the PHI node in-place. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy() && !PhiTy->isFloatTy() && !PhiTy->isDoubleTy() && !PhiTy->isHalfTy()) return false; if (PhiTy->isFloatingPointTy()) return isFPInductionPHI(Phi, TheLoop, PSE.getSE(), D); const SCEV *PhiScev = PSE.getSCEV(Phi); const auto *AR = dyn_cast(PhiScev); // We need this expression to be an AddRecExpr. if (Assume && !AR) AR = PSE.getAsAddRec(Phi); if (!AR) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); return false; } // Record any Cast instructions that participate in the induction update const auto *SymbolicPhi = dyn_cast(PhiScev); // If we started from an UnknownSCEV, and managed to build an addRecurrence // only after enabling Assume with PSCEV, this means we may have encountered // cast instructions that required adding a runtime check in order to // guarantee the correctness of the AddRecurence respresentation of the // induction. if (PhiScev != AR && SymbolicPhi) { SmallVector Casts; if (getCastsForInductionPHI(PSE, SymbolicPhi, AR, Casts)) return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR, &Casts); } return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR); } bool InductionDescriptor::isInductionPHI( PHINode *Phi, const Loop *TheLoop, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr, SmallVectorImpl *CastsToIgnore) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) return false; // Check that the PHI is consecutive. const SCEV *PhiScev = Expr ? Expr : SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast(PhiScev); if (!AR) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); return false; } if (AR->getLoop() != TheLoop) { // FIXME: We should treat this as a uniform. Unfortunately, we // don't currently know how to handled uniform PHIs. DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n"); return false; } Value *StartValue = Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader()); const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. // The stride may be a constant or a loop invariant integer value. const SCEVConstant *ConstStep = dyn_cast(Step); if (!ConstStep && !SE->isLoopInvariant(Step, TheLoop)) return false; if (PhiTy->isIntegerTy()) { D = InductionDescriptor(StartValue, IK_IntInduction, Step, /*BOp=*/ nullptr, CastsToIgnore); return true; } assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); // Pointer induction should be a constant. if (!ConstStep) return false; ConstantInt *CV = ConstStep->getValue(); Type *PointerElementType = PhiTy->getPointerElementType(); // The pointer stride cannot be determined if the pointer element type is not // sized. if (!PointerElementType->isSized()) return false; const DataLayout &DL = Phi->getModule()->getDataLayout(); int64_t Size = static_cast(DL.getTypeAllocSize(PointerElementType)); if (!Size) return false; int64_t CVSize = CV->getSExtValue(); if (CVSize % Size) return false; auto *StepValue = SE->getConstant(CV->getType(), CVSize / Size, true /* signed */); D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue); return true; } bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) { bool Changed = false; // We re-use a vector for the in-loop predecesosrs. SmallVector InLoopPredecessors; auto RewriteExit = [&](BasicBlock *BB) { assert(InLoopPredecessors.empty() && "Must start with an empty predecessors list!"); auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); }); // See if there are any non-loop predecessors of this exit block and // keep track of the in-loop predecessors. bool IsDedicatedExit = true; for (auto *PredBB : predecessors(BB)) if (L->contains(PredBB)) { if (isa(PredBB->getTerminator())) // We cannot rewrite exiting edges from an indirectbr. return false; InLoopPredecessors.push_back(PredBB); } else { IsDedicatedExit = false; } assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!"); // Nothing to do if this is already a dedicated exit. if (IsDedicatedExit) return false; auto *NewExitBB = SplitBlockPredecessors( BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA); if (!NewExitBB) DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: " << *L << "\n"); else DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " << NewExitBB->getName() << "\n"); return true; }; // Walk the exit blocks directly rather than building up a data structure for // them, but only visit each one once. SmallPtrSet Visited; for (auto *BB : L->blocks()) for (auto *SuccBB : successors(BB)) { // We're looking for exit blocks so skip in-loop successors. if (L->contains(SuccBB)) continue; // Visit each exit block exactly once. if (!Visited.insert(SuccBB).second) continue; Changed |= RewriteExit(SuccBB); } return Changed; } /// \brief Returns the instructions that use values defined in the loop. SmallVector llvm::findDefsUsedOutsideOfLoop(Loop *L) { SmallVector UsedOutside; for (auto *Block : L->getBlocks()) // FIXME: I believe that this could use copy_if if the Inst reference could // be adapted into a pointer. for (auto &Inst : *Block) { auto Users = Inst.users(); if (any_of(Users, [&](User *U) { auto *Use = cast(U); return !L->contains(Use->getParent()); })) UsedOutside.push_back(&Inst); } return UsedOutside; } void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) { // By definition, all loop passes need the LoopInfo analysis and the // Dominator tree it depends on. Because they all participate in the loop // pass manager, they must also preserve these. AU.addRequired(); AU.addPreserved(); AU.addRequired(); AU.addPreserved(); // We must also preserve LoopSimplify and LCSSA. We locally access their IDs // here because users shouldn't directly get them from this header. extern char &LoopSimplifyID; extern char &LCSSAID; AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); // This is used in the LPPassManager to perform LCSSA verification on passes // which preserve lcssa form AU.addRequired(); AU.addPreserved(); // Loop passes are designed to run inside of a loop pass manager which means // that any function analyses they require must be required by the first loop // pass in the manager (so that it is computed before the loop pass manager // runs) and preserved by all loop pasess in the manager. To make this // reasonably robust, the set needed for most loop passes is maintained here. // If your loop pass requires an analysis not listed here, you will need to // carefully audit the loop pass manager nesting structure that results. AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.addRequired(); AU.addPreserved(); } /// Manually defined generic "LoopPass" dependency initialization. This is used /// to initialize the exact set of passes from above in \c /// getLoopAnalysisUsage. It can be used within a loop pass's initialization /// with: /// /// INITIALIZE_PASS_DEPENDENCY(LoopPass) /// /// As-if "LoopPass" were a pass. void llvm::initializeLoopPassPass(PassRegistry &Registry) { INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) } /// \brief Find string metadata for loop /// /// If it has a value (e.g. {"llvm.distribute", 1} return the value as an /// operand or null otherwise. If the string metadata is not found return /// Optional's not-a-value. Optional llvm::findStringMetadataForLoop(Loop *TheLoop, StringRef Name) { MDNode *LoopID = TheLoop->getLoopID(); // Return none if LoopID is false. if (!LoopID) return None; // First operand should refer to the loop id itself. assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); // Iterate over LoopID operands and look for MDString Metadata for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { MDNode *MD = dyn_cast(LoopID->getOperand(i)); if (!MD) continue; MDString *S = dyn_cast(MD->getOperand(0)); if (!S) continue; // Return true if MDString holds expected MetaData. if (Name.equals(S->getString())) switch (MD->getNumOperands()) { case 1: return nullptr; case 2: return &MD->getOperand(1); default: llvm_unreachable("loop metadata has 0 or 1 operand"); } } return None; } /// Does a BFS from a given node to all of its children inside a given loop. /// The returned vector of nodes includes the starting point. SmallVector llvm::collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) { SmallVector Worklist; auto AddRegionToWorklist = [&](DomTreeNode *DTN) { // Only include subregions in the top level loop. BasicBlock *BB = DTN->getBlock(); if (CurLoop->contains(BB)) Worklist.push_back(DTN); }; AddRegionToWorklist(N); for (size_t I = 0; I < Worklist.size(); I++) for (DomTreeNode *Child : Worklist[I]->getChildren()) AddRegionToWorklist(Child); return Worklist; } void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr, LoopInfo *LI = nullptr) { assert((!DT || L->isLCSSAForm(*DT)) && "Expected LCSSA!"); auto *Preheader = L->getLoopPreheader(); assert(Preheader && "Preheader should exist!"); // Now that we know the removal is safe, remove the loop by changing the // branch from the preheader to go to the single exit block. // // Because we're deleting a large chunk of code at once, the sequence in which // we remove things is very important to avoid invalidation issues. // Tell ScalarEvolution that the loop is deleted. Do this before // deleting the loop so that ScalarEvolution can look at the loop // to determine what it needs to clean up. if (SE) SE->forgetLoop(L); auto *ExitBlock = L->getUniqueExitBlock(); assert(ExitBlock && "Should have a unique exit block!"); assert(L->hasDedicatedExits() && "Loop should have dedicated exits!"); auto *OldBr = dyn_cast(Preheader->getTerminator()); assert(OldBr && "Preheader must end with a branch"); assert(OldBr->isUnconditional() && "Preheader must have a single successor"); // Connect the preheader to the exit block. Keep the old edge to the header // around to perform the dominator tree update in two separate steps // -- #1 insertion of the edge preheader -> exit and #2 deletion of the edge // preheader -> header. // // // 0. Preheader 1. Preheader 2. Preheader // | | | | // V | V | // Header <--\ | Header <--\ | Header <--\ // | | | | | | | | | | | // | V | | | V | | | V | // | Body --/ | | Body --/ | | Body --/ // V V V V V // Exit Exit Exit // // By doing this is two separate steps we can perform the dominator tree // update without using the batch update API. // // Even when the loop is never executed, we cannot remove the edge from the // source block to the exit block. Consider the case where the unexecuted loop // branches back to an outer loop. If we deleted the loop and removed the edge // coming to this inner loop, this will break the outer loop structure (by // deleting the backedge of the outer loop). If the outer loop is indeed a // non-loop, it will be deleted in a future iteration of loop deletion pass. IRBuilder<> Builder(OldBr); Builder.CreateCondBr(Builder.getFalse(), L->getHeader(), ExitBlock); // Remove the old branch. The conditional branch becomes a new terminator. OldBr->eraseFromParent(); // Rewrite phis in the exit block to get their inputs from the Preheader // instead of the exiting block. BasicBlock::iterator BI = ExitBlock->begin(); while (PHINode *P = dyn_cast(BI)) { // Set the zero'th element of Phi to be from the preheader and remove all // other incoming values. Given the loop has dedicated exits, all other // incoming values must be from the exiting blocks. int PredIndex = 0; P->setIncomingBlock(PredIndex, Preheader); // Removes all incoming values from all other exiting blocks (including // duplicate values from an exiting block). // Nuke all entries except the zero'th entry which is the preheader entry. // NOTE! We need to remove Incoming Values in the reverse order as done // below, to keep the indices valid for deletion (removeIncomingValues // updates getNumIncomingValues and shifts all values down into the operand // being deleted). for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) P->removeIncomingValue(e - i, false); assert((P->getNumIncomingValues() == 1 && P->getIncomingBlock(PredIndex) == Preheader) && "Should have exactly one value and that's from the preheader!"); ++BI; } // Disconnect the loop body by branching directly to its exit. Builder.SetInsertPoint(Preheader->getTerminator()); Builder.CreateBr(ExitBlock); // Remove the old branch. Preheader->getTerminator()->eraseFromParent(); if (DT) { // Update the dominator tree by informing it about the new edge from the // preheader to the exit. DT->insertEdge(Preheader, ExitBlock); // Inform the dominator tree about the removed edge. DT->deleteEdge(Preheader, L->getHeader()); } // Remove the block from the reference counting scheme, so that we can // delete it freely later. for (auto *Block : L->blocks()) Block->dropAllReferences(); if (LI) { // Erase the instructions and the blocks without having to worry // about ordering because we already dropped the references. // NOTE: This iteration is safe because erasing the block does not remove // its entry from the loop's block list. We do that in the next section. for (Loop::block_iterator LpI = L->block_begin(), LpE = L->block_end(); LpI != LpE; ++LpI) (*LpI)->eraseFromParent(); // Finally, the blocks from loopinfo. This has to happen late because // otherwise our loop iterators won't work. SmallPtrSet blocks; blocks.insert(L->block_begin(), L->block_end()); for (BasicBlock *BB : blocks) LI->removeBlock(BB); // The last step is to update LoopInfo now that we've eliminated this loop. LI->erase(L); } } /// Returns true if the instruction in a loop is guaranteed to execute at least /// once. bool llvm::isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo) { // We have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. // If the instruction is in the header block for the loop (which is very // common), it is always guaranteed to dominate the exit blocks. Since this // is a common case, and can save some work, check it now. if (Inst.getParent() == CurLoop->getHeader()) // If there's a throw in the header block, we can't guarantee we'll reach // Inst. return !SafetyInfo->HeaderMayThrow; // Somewhere in this loop there is an instruction which may throw and make us // exit the loop. if (SafetyInfo->MayThrow) return false; // Get the exit blocks for the current loop. SmallVector ExitBlocks; CurLoop->getExitBlocks(ExitBlocks); // Verify that the block dominates each of the exit blocks of the loop. for (BasicBlock *ExitBlock : ExitBlocks) if (!DT->dominates(Inst.getParent(), ExitBlock)) return false; // As a degenerate case, if the loop is statically infinite then we haven't // proven anything since there are no exit blocks. if (ExitBlocks.empty()) return false; // FIXME: In general, we have to prove that the loop isn't an infinite loop. // See http::llvm.org/PR24078 . (The "ExitBlocks.empty()" check above is // just a special case of this.) return true; } Optional llvm::getLoopEstimatedTripCount(Loop *L) { // Only support loops with a unique exiting block, and a latch. if (!L->getExitingBlock()) return None; // Get the branch weights for the the loop's backedge. BranchInst *LatchBR = dyn_cast(L->getLoopLatch()->getTerminator()); if (!LatchBR || LatchBR->getNumSuccessors() != 2) return None; assert((LatchBR->getSuccessor(0) == L->getHeader() || LatchBR->getSuccessor(1) == L->getHeader()) && "At least one edge out of the latch must go to the header"); // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. uint64_t TrueVal, FalseVal; if (!LatchBR->extractProfMetadata(TrueVal, FalseVal)) return None; if (!TrueVal || !FalseVal) return 0; // Divide the count of the backedge by the count of the edge exiting the loop, // rounding to nearest. if (LatchBR->getSuccessor(0) == L->getHeader()) return (TrueVal + (FalseVal / 2)) / FalseVal; else return (FalseVal + (TrueVal / 2)) / TrueVal; } /// \brief Adds a 'fast' flag to floating point operations. static Value *addFastMathFlag(Value *V) { if (isa(V)) { FastMathFlags Flags; Flags.setFast(); cast(V)->setFastMathFlags(Flags); } return V; } // Helper to generate a log2 shuffle reduction. Value * llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, ArrayRef RedOps) { unsigned VF = Src->getType()->getVectorNumElements(); // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); Value *TmpVec = Src; SmallVector ShuffleMask(VF, nullptr); for (unsigned i = VF; i != 1; i >>= 1) { // Move the upper half of the vector to the lower half. for (unsigned j = 0; j != i / 2; ++j) ShuffleMask[j] = Builder.getInt32(i / 2 + j); // Fill the rest of the mask with undef. std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), UndefValue::get(Builder.getInt32Ty())); Value *Shuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), ConstantVector::get(ShuffleMask), "rdx.shuf"); if (Op != Instruction::ICmp && Op != Instruction::FCmp) { // Floating point operations had to be 'fast' to enable the reduction. TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); } else { assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid && "Invalid min/max"); TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec, Shuf); } if (!RedOps.empty()) propagateIRFlags(TmpVec, RedOps); } // The result is in the first element of the vector. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } /// Create a simple vector reduction specified by an opcode and some /// flags (if generating min/max reductions). Value *llvm::createSimpleTargetReduction( IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode, Value *Src, TargetTransformInfo::ReductionFlags Flags, ArrayRef RedOps) { assert(isa(Src->getType()) && "Type must be a vector"); Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType()); std::function BuildFunc; using RD = RecurrenceDescriptor; RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid; // TODO: Support creating ordered reductions. FastMathFlags FMFFast; FMFFast.setFast(); switch (Opcode) { case Instruction::Add: BuildFunc = [&]() { return Builder.CreateAddReduce(Src); }; break; case Instruction::Mul: BuildFunc = [&]() { return Builder.CreateMulReduce(Src); }; break; case Instruction::And: BuildFunc = [&]() { return Builder.CreateAndReduce(Src); }; break; case Instruction::Or: BuildFunc = [&]() { return Builder.CreateOrReduce(Src); }; break; case Instruction::Xor: BuildFunc = [&]() { return Builder.CreateXorReduce(Src); }; break; case Instruction::FAdd: BuildFunc = [&]() { auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src); cast(Rdx)->setFastMathFlags(FMFFast); return Rdx; }; break; case Instruction::FMul: BuildFunc = [&]() { auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src); cast(Rdx)->setFastMathFlags(FMFFast); return Rdx; }; break; case Instruction::ICmp: if (Flags.IsMaxOp) { MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax; BuildFunc = [&]() { return Builder.CreateIntMaxReduce(Src, Flags.IsSigned); }; } else { MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin; BuildFunc = [&]() { return Builder.CreateIntMinReduce(Src, Flags.IsSigned); }; } break; case Instruction::FCmp: if (Flags.IsMaxOp) { MinMaxKind = RD::MRK_FloatMax; BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); }; } else { MinMaxKind = RD::MRK_FloatMin; BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); }; } break; default: llvm_unreachable("Unhandled opcode"); break; } if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) return BuildFunc(); return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); } /// Create a vector reduction using a given recurrence descriptor. Value *llvm::createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, RecurrenceDescriptor &Desc, Value *Src, bool NoNaN) { // TODO: Support in-order reductions based on the recurrence descriptor. using RD = RecurrenceDescriptor; RD::RecurrenceKind RecKind = Desc.getRecurrenceKind(); TargetTransformInfo::ReductionFlags Flags; Flags.NoNaN = NoNaN; switch (RecKind) { case RD::RK_FloatAdd: return createSimpleTargetReduction(B, TTI, Instruction::FAdd, Src, Flags); case RD::RK_FloatMult: return createSimpleTargetReduction(B, TTI, Instruction::FMul, Src, Flags); case RD::RK_IntegerAdd: return createSimpleTargetReduction(B, TTI, Instruction::Add, Src, Flags); case RD::RK_IntegerMult: return createSimpleTargetReduction(B, TTI, Instruction::Mul, Src, Flags); case RD::RK_IntegerAnd: return createSimpleTargetReduction(B, TTI, Instruction::And, Src, Flags); case RD::RK_IntegerOr: return createSimpleTargetReduction(B, TTI, Instruction::Or, Src, Flags); case RD::RK_IntegerXor: return createSimpleTargetReduction(B, TTI, Instruction::Xor, Src, Flags); case RD::RK_IntegerMinMax: { RD::MinMaxRecurrenceKind MMKind = Desc.getMinMaxRecurrenceKind(); Flags.IsMaxOp = (MMKind == RD::MRK_SIntMax || MMKind == RD::MRK_UIntMax); Flags.IsSigned = (MMKind == RD::MRK_SIntMax || MMKind == RD::MRK_SIntMin); return createSimpleTargetReduction(B, TTI, Instruction::ICmp, Src, Flags); } case RD::RK_FloatMinMax: { Flags.IsMaxOp = Desc.getMinMaxRecurrenceKind() == RD::MRK_FloatMax; return createSimpleTargetReduction(B, TTI, Instruction::FCmp, Src, Flags); } default: llvm_unreachable("Unhandled RecKind"); } } void llvm::propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue) { auto *VecOp = dyn_cast(I); if (!VecOp) return; auto *Intersection = (OpValue == nullptr) ? dyn_cast(VL[0]) : dyn_cast(OpValue); if (!Intersection) return; const unsigned Opcode = Intersection->getOpcode(); VecOp->copyIRFlags(Intersection); for (auto *V : VL) { auto *Instr = dyn_cast(V); if (!Instr) continue; if (OpValue == nullptr || Opcode == Instr->getOpcode()) VecOp->andIRFlags(V); } } Index: vendor/llvm/dist/test/CodeGen/X86/avx512-calling-conv.ll =================================================================== --- vendor/llvm/dist/test/CodeGen/X86/avx512-calling-conv.ll (revision 327151) +++ vendor/llvm/dist/test/CodeGen/X86/avx512-calling-conv.ll (revision 327152) @@ -1,425 +1,412 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=KNL_X32 define <16 x i1> @test1() { ; ALL_X64-LABEL: test1: ; ALL_X64: ## %bb.0: ; ALL_X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test1: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; KNL_X32-NEXT: retl ret <16 x i1> zeroinitializer } define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; ALL_X64-LABEL: test2: ; ALL_X64: ## %bb.0: ; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL_X64-NEXT: vpsllw $7, %xmm0, %xmm0 ; ALL_X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; ALL_X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ALL_X64-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test2: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL_X32-NEXT: vpsllw $7, %xmm0, %xmm0 ; KNL_X32-NEXT: vpand LCPI1_0, %xmm0, %xmm0 ; KNL_X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL_X32-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; KNL_X32-NEXT: retl %c = and <16 x i1>%a, %b ret <16 x i1> %c } define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; ALL_X64-LABEL: test3: ; ALL_X64: ## %bb.0: ; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL_X64-NEXT: vpsllw $15, %xmm0, %xmm0 ; ALL_X64-NEXT: vpsraw $15, %xmm0, %xmm0 ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test3: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL_X32-NEXT: vpsllw $15, %xmm0, %xmm0 ; KNL_X32-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <8 x i1>%a, %b ret <8 x i1> %c } define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) { ; KNL-LABEL: test4: ; KNL: ## %bb.0: ; KNL-NEXT: vandps %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test4: ; SKX: ## %bb.0: ; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ; SKX-NEXT: vpsrad $31, %xmm0, %xmm0 ; SKX-NEXT: retq ; ; KNL_X32-LABEL: test4: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: vandps %xmm1, %xmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <4 x i1>%a, %b ret <4 x i1> %c } declare <8 x i1> @func8xi1(<8 x i1> %a) define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) { ; KNL-LABEL: test5: ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 ; KNL-NEXT: callq _func8xi1 ; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; KNL-NEXT: vpslld $31, %ymm0, %ymm0 ; KNL-NEXT: vpsrad $31, %ymm0, %ymm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; ; SKX-LABEL: test5: ; SKX: ## %bb.0: ; SKX-NEXT: pushq %rax ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _func8xi1 ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SKX-NEXT: vpslld $31, %ymm0, %ymm0 ; SKX-NEXT: vpsrad $31, %ymm0, %ymm0 ; SKX-NEXT: popq %rax ; SKX-NEXT: retq ; ; KNL_X32-LABEL: test5: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: subl $12, %esp ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 ; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 ; KNL_X32-NEXT: calll _func8xi1 ; KNL_X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; KNL_X32-NEXT: vpslld $31, %ymm0, %ymm0 ; KNL_X32-NEXT: vpsrad $31, %ymm0, %ymm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl %cmpRes = icmp sgt <8 x i32>%a, %b %resi = call <8 x i1> @func8xi1(<8 x i1> %cmpRes) %res = sext <8 x i1>%resi to <8 x i32> ret <8 x i32> %res } declare <16 x i1> @func16xi1(<16 x i1> %a) define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL-LABEL: test6: ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: callq _func16xi1 ; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vpsrad $31, %zmm0, %zmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; ; SKX-LABEL: test6: ; SKX: ## %bb.0: ; SKX-NEXT: pushq %rax ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; SKX-NEXT: vpmovm2b %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _func16xi1 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; SKX-NEXT: vpslld $31, %zmm0, %zmm0 ; SKX-NEXT: vpsrad $31, %zmm0, %zmm0 ; SKX-NEXT: popq %rax ; SKX-NEXT: retq ; ; KNL_X32-LABEL: test6: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: subl $12, %esp ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: calll _func16xi1 ; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL_X32-NEXT: vpsrad $31, %zmm0, %zmm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl %cmpRes = icmp sgt <16 x i32>%a, %b %resi = call <16 x i1> @func16xi1(<16 x i1> %cmpRes) %res = sext <16 x i1>%resi to <16 x i32> ret <16 x i32> %res } declare <4 x i1> @func4xi1(<4 x i1> %a) define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) { ; KNL-LABEL: test7: ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: callq _func4xi1 ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; ; SKX-LABEL: test7: ; SKX: ## %bb.0: ; SKX-NEXT: pushq %rax ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: callq _func4xi1 ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ; SKX-NEXT: vpsrad $31, %xmm0, %xmm0 ; SKX-NEXT: popq %rax ; SKX-NEXT: retq ; ; KNL_X32-LABEL: test7: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: subl $12, %esp ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; KNL_X32-NEXT: calll _func4xi1 ; KNL_X32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_X32-NEXT: vpsrad $31, %xmm0, %xmm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl %cmpRes = icmp sgt <4 x i32>%a, %b %resi = call <4 x i1> @func4xi1(<4 x i1> %cmpRes) %res = sext <4 x i1>%resi to <4 x i32> ret <4 x i32> %res } define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL-LABEL: test7a: ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 ; KNL-NEXT: callq _func8xi1 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL-NEXT: movb $85, %al -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; ; SKX-LABEL: test7a: ; SKX: ## %bb.0: ; SKX-NEXT: pushq %rax ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _func8xi1 +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k0 -; SKX-NEXT: movb $85, %al -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kandb %k1, %k0, %k0 -; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: vpsraw $15, %xmm0, %xmm0 ; SKX-NEXT: popq %rax ; SKX-NEXT: retq ; ; KNL_X32-LABEL: test7a: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: subl $12, %esp ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 ; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 ; KNL_X32-NEXT: calll _func8xi1 -; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL_X32-NEXT: movb $85, %al -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 -; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 +; KNL_X32-NEXT: vpand LCPI7_0, %xmm0, %xmm0 +; KNL_X32-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL_X32-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl %cmpRes = icmp sgt <8 x i32>%a, %b %resi = call <8 x i1> @func8xi1(<8 x i1> %cmpRes) %res = and <8 x i1>%resi, ret <8 x i1> %res } define <16 x i8> @test8(<16 x i8> %a1, <16 x i8> %a2, i1 %cond) { ; ALL_X64-LABEL: test8: ; ALL_X64: ## %bb.0: ; ALL_X64-NEXT: testb $1, %dil ; ALL_X64-NEXT: jne LBB8_2 ; ALL_X64-NEXT: ## %bb.1: ; ALL_X64-NEXT: vmovaps %xmm1, %xmm0 ; ALL_X64-NEXT: LBB8_2: ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test8: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: jne LBB8_2 ; KNL_X32-NEXT: ## %bb.1: ; KNL_X32-NEXT: vmovaps %xmm1, %xmm0 ; KNL_X32-NEXT: LBB8_2: ; KNL_X32-NEXT: retl %res = select i1 %cond, <16 x i8> %a1, <16 x i8> %a2 ret <16 x i8> %res } define i1 @test9(double %a, double %b) { ; ALL_X64-LABEL: test9: ; ALL_X64: ## %bb.0: ; ALL_X64-NEXT: vucomisd %xmm0, %xmm1 ; ALL_X64-NEXT: setb %al ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test9: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; KNL_X32-NEXT: vucomisd {{[0-9]+}}(%esp), %xmm0 ; KNL_X32-NEXT: setb %al ; KNL_X32-NEXT: retl %c = fcmp ugt double %a, %b ret i1 %c } define i32 @test10(i32 %a, i32 %b, i1 %cond) { ; ALL_X64-LABEL: test10: ; ALL_X64: ## %bb.0: ; ALL_X64-NEXT: testb $1, %dl ; ALL_X64-NEXT: cmovel %esi, %edi ; ALL_X64-NEXT: movl %edi, %eax ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test10: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: leal {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: leal {{[0-9]+}}(%esp), %ecx ; KNL_X32-NEXT: cmovnel %eax, %ecx ; KNL_X32-NEXT: movl (%ecx), %eax ; KNL_X32-NEXT: retl %c = select i1 %cond, i32 %a, i32 %b ret i32 %c } define i1 @test11(i32 %a, i32 %b) { ; ALL_X64-LABEL: test11: ; ALL_X64: ## %bb.0: ; ALL_X64-NEXT: cmpl %esi, %edi ; ALL_X64-NEXT: setg %al ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test11: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: setg %al ; KNL_X32-NEXT: retl %c = icmp sgt i32 %a, %b ret i1 %c } define i32 @test12(i32 %a1, i32 %a2, i32 %b1) { ; ALL_X64-LABEL: test12: ; ALL_X64: ## %bb.0: ; ALL_X64-NEXT: pushq %rbp ; ALL_X64-NEXT: .cfi_def_cfa_offset 16 ; ALL_X64-NEXT: pushq %r14 ; ALL_X64-NEXT: .cfi_def_cfa_offset 24 ; ALL_X64-NEXT: pushq %rbx ; ALL_X64-NEXT: .cfi_def_cfa_offset 32 ; ALL_X64-NEXT: .cfi_offset %rbx, -32 ; ALL_X64-NEXT: .cfi_offset %r14, -24 ; ALL_X64-NEXT: .cfi_offset %rbp, -16 ; ALL_X64-NEXT: movl %esi, %r14d ; ALL_X64-NEXT: movl %edi, %ebp ; ALL_X64-NEXT: movl %edx, %esi ; ALL_X64-NEXT: callq _test11 ; ALL_X64-NEXT: movzbl %al, %ebx ; ALL_X64-NEXT: movl %ebp, %edi ; ALL_X64-NEXT: movl %r14d, %esi ; ALL_X64-NEXT: movl %ebx, %edx ; ALL_X64-NEXT: callq _test10 ; ALL_X64-NEXT: xorl %ecx, %ecx ; ALL_X64-NEXT: testb $1, %bl ; ALL_X64-NEXT: cmovel %ecx, %eax ; ALL_X64-NEXT: popq %rbx ; ALL_X64-NEXT: popq %r14 ; ALL_X64-NEXT: popq %rbp ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test12: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: pushl %ebx ; KNL_X32-NEXT: .cfi_def_cfa_offset 8 ; KNL_X32-NEXT: pushl %edi ; KNL_X32-NEXT: .cfi_def_cfa_offset 12 ; KNL_X32-NEXT: pushl %esi ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: subl $16, %esp ; KNL_X32-NEXT: .cfi_def_cfa_offset 32 ; KNL_X32-NEXT: .cfi_offset %esi, -16 ; KNL_X32-NEXT: .cfi_offset %edi, -12 ; KNL_X32-NEXT: .cfi_offset %ebx, -8 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl %edi, (%esp) ; KNL_X32-NEXT: calll _test11 ; KNL_X32-NEXT: movl %eax, %ebx ; KNL_X32-NEXT: movzbl %bl, %eax ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl %esi, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl %edi, (%esp) ; KNL_X32-NEXT: calll _test10 ; KNL_X32-NEXT: xorl %ecx, %ecx ; KNL_X32-NEXT: testb $1, %bl ; KNL_X32-NEXT: cmovel %ecx, %eax ; KNL_X32-NEXT: addl $16, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: popl %edi ; KNL_X32-NEXT: popl %ebx ; KNL_X32-NEXT: retl %cond = call i1 @test11(i32 %a1, i32 %b1) %res = call i32 @test10(i32 %a1, i32 %a2, i1 %cond) %res1 = select i1 %cond, i32 %res, i32 0 ret i32 %res1 } Index: vendor/llvm/dist/test/CodeGen/X86/avx512-cvt.ll =================================================================== --- vendor/llvm/dist/test/CodeGen/X86/avx512-cvt.ll (revision 327151) +++ vendor/llvm/dist/test/CodeGen/X86/avx512-cvt.ll (revision 327152) @@ -1,2032 +1,2020 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW define <16 x float> @sitof32(<16 x i32> %a) nounwind { ; ALL-LABEL: sitof32: ; ALL: # %bb.0: ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %b = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <8 x double> @sltof864(<8 x i64> %a) { ; NODQ-LABEL: sltof864: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq ; ; DQ-LABEL: sltof864: ; DQ: # %bb.0: ; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; DQ-NEXT: retq %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <4 x double> @slto4f64(<4 x i64> %a) { ; NODQ-LABEL: slto4f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto4f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 ; VLDQ-NEXT: retq ; ; AVX512DQ-LABEL: slto4f64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; AVX512DQ-NEXT: retq %b = sitofp <4 x i64> %a to <4 x double> ret <4 x double> %b } define <2 x double> @slto2f64(<2 x i64> %a) { ; NODQ-LABEL: slto2f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto2f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; AVX512DQ-LABEL: slto2f64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %b = sitofp <2 x i64> %a to <2 x double> ret <2 x double> %b } define <2 x float> @sltof2f32(<2 x i64> %a) { ; NODQ-LABEL: sltof2f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; NODQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; NODQ-NEXT: retq ; ; VLDQ-LABEL: sltof2f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; AVX512DQ-LABEL: sltof2f32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %b = sitofp <2 x i64> %a to <2 x float> ret <2 x float>%b } define <4 x float> @slto4f32_mem(<4 x i64>* %a) { ; NODQ-LABEL: slto4f32_mem: ; NODQ: # %bb.0: ; NODQ-NEXT: vmovdqu (%rdi), %ymm0 ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto4f32_mem: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 ; VLDQ-NEXT: retq ; ; AVX512DQ-LABEL: slto4f32_mem: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %a1 = load <4 x i64>, <4 x i64>* %a, align 8 %b = sitofp <4 x i64> %a1 to <4 x float> ret <4 x float>%b } define <4 x i64> @f64to4sl(<4 x double> %a) { ; NODQ-LABEL: f64to4sl: ; NODQ: # %bb.0: ; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1 ; NODQ-NEXT: vcvttsd2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm2 ; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; NODQ-NEXT: vcvttsd2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm1 ; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; NODQ-NEXT: vcvttsd2si %xmm0, %rax ; NODQ-NEXT: vmovq %rax, %xmm2 ; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; NODQ-NEXT: vcvttsd2si %xmm0, %rax ; NODQ-NEXT: vmovq %rax, %xmm0 ; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; ; VLDQ-LABEL: f64to4sl: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 ; VLDQ-NEXT: retq ; ; AVX512DQ-LABEL: f64to4sl: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; AVX512DQ-NEXT: retq %b = fptosi <4 x double> %a to <4 x i64> ret <4 x i64> %b } define <4 x i64> @f32to4sl(<4 x float> %a) { ; NODQ-LABEL: f32to4sl: ; NODQ: # %bb.0: ; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] ; NODQ-NEXT: vcvttss2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm1 ; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; NODQ-NEXT: vcvttss2si %xmm2, %rax ; NODQ-NEXT: vmovq %rax, %xmm2 ; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; NODQ-NEXT: vcvttss2si %xmm0, %rax ; NODQ-NEXT: vmovq %rax, %xmm2 ; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; NODQ-NEXT: vcvttss2si %xmm0, %rax ; NODQ-NEXT: vmovq %rax, %xmm0 ; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; ; VLDQ-LABEL: f32to4sl: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 ; VLDQ-NEXT: retq ; ; AVX512DQ-LABEL: f32to4sl: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0 ; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; AVX512DQ-NEXT: retq %b = fptosi <4 x float> %a to <4 x i64> ret <4 x i64> %b } define <4 x float> @slto4f32(<4 x i64> %a) { ; NODQ-LABEL: slto4f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto4f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 ; VLDQ-NEXT: vzeroupper ; VLDQ-NEXT: retq ; ; AVX512DQ-LABEL: slto4f32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %b = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <4 x float> @ulto4f32(<4 x i64> %a) { ; NODQ-LABEL: ulto4f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq ; ; VLDQ-LABEL: ulto4f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 ; VLDQ-NEXT: vzeroupper ; VLDQ-NEXT: retq ; ; AVX512DQ-LABEL: ulto4f32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %b = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <8 x double> @ulto8f64(<8 x i64> %a) { ; NODQ-LABEL: ulto8f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq ; ; DQ-LABEL: ulto8f64: ; DQ: # %bb.0: ; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; DQ-NEXT: retq %b = uitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <16 x double> @ulto16f64(<16 x i64> %a) { ; NODQ-LABEL: ulto16f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 ; NODQ-NEXT: retq ; ; DQ-LABEL: ulto16f64: ; DQ: # %bb.0: ; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1 ; DQ-NEXT: retq %b = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } define <16 x i32> @f64to16si(<16 x float> %a) nounwind { ; ALL-LABEL: f64to16si: ; ALL: # %bb.0: ; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 ; ALL-NEXT: retq %b = fptosi <16 x float> %a to <16 x i32> ret <16 x i32> %b } define <16 x i8> @f32to16sc(<16 x float> %f) { ; ALL-LABEL: f32to16sc: ; ALL: # %bb.0: ; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 ; ALL-NEXT: vpmovdb %zmm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %res = fptosi <16 x float> %f to <16 x i8> ret <16 x i8> %res } define <16 x i16> @f32to16ss(<16 x float> %f) { ; ALL-LABEL: f32to16ss: ; ALL: # %bb.0: ; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 ; ALL-NEXT: vpmovdw %zmm0, %ymm0 ; ALL-NEXT: retq %res = fptosi <16 x float> %f to <16 x i16> ret <16 x i16> %res } define <16 x i32> @f32to16ui(<16 x float> %a) nounwind { ; ALL-LABEL: f32to16ui: ; ALL: # %bb.0: ; ALL-NEXT: vcvttps2udq %zmm0, %zmm0 ; ALL-NEXT: retq %b = fptoui <16 x float> %a to <16 x i32> ret <16 x i32> %b } define <16 x i8> @f32to16uc(<16 x float> %f) { ; ALL-LABEL: f32to16uc: ; ALL: # %bb.0: ; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 ; ALL-NEXT: vpmovdb %zmm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %res = fptoui <16 x float> %f to <16 x i8> ret <16 x i8> %res } define <16 x i16> @f32to16us(<16 x float> %f) { ; ALL-LABEL: f32to16us: ; ALL: # %bb.0: ; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 ; ALL-NEXT: vpmovdw %zmm0, %ymm0 ; ALL-NEXT: retq %res = fptoui <16 x float> %f to <16 x i16> ret <16 x i16> %res } define <8 x i32> @f32to8ui(<8 x float> %a) nounwind { ; NOVL-LABEL: f32to8ui: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 ; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; NOVL-NEXT: retq ; ; VL-LABEL: f32to8ui: ; VL: # %bb.0: ; VL-NEXT: vcvttps2udq %ymm0, %ymm0 ; VL-NEXT: retq %b = fptoui <8 x float> %a to <8 x i32> ret <8 x i32> %b } define <4 x i32> @f32to4ui(<4 x float> %a) nounwind { ; NOVL-LABEL: f32to4ui: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 ; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 ; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f32to4ui: ; VL: # %bb.0: ; VL-NEXT: vcvttps2udq %xmm0, %xmm0 ; VL-NEXT: retq %b = fptoui <4 x float> %a to <4 x i32> ret <4 x i32> %b } define <8 x i32> @f64to8ui(<8 x double> %a) nounwind { ; ALL-LABEL: f64to8ui: ; ALL: # %bb.0: ; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0 ; ALL-NEXT: retq %b = fptoui <8 x double> %a to <8 x i32> ret <8 x i32> %b } define <8 x i16> @f64to8us(<8 x double> %f) { ; NOVL-LABEL: f64to8us: ; NOVL: # %bb.0: ; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 ; NOVL-NEXT: vpmovdw %zmm0, %ymm0 ; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f64to8us: ; VL: # %bb.0: ; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 ; VL-NEXT: vpmovdw %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq %res = fptoui <8 x double> %f to <8 x i16> ret <8 x i16> %res } define <8 x i8> @f64to8uc(<8 x double> %f) { ; NOVL-LABEL: f64to8uc: ; NOVL: # %bb.0: ; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 ; NOVL-NEXT: vpmovdw %zmm0, %ymm0 ; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f64to8uc: ; VL: # %bb.0: ; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 ; VL-NEXT: vpmovdw %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq %res = fptoui <8 x double> %f to <8 x i8> ret <8 x i8> %res } define <4 x i32> @f64to4ui(<4 x double> %a) nounwind { ; NOVL-LABEL: f64to4ui: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 ; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f64to4ui: ; VL: # %bb.0: ; VL-NEXT: vcvttpd2udq %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq %b = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %b } define <8 x double> @sito8f64(<8 x i32> %a) { ; ALL-LABEL: sito8f64: ; ALL: # %bb.0: ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %b = sitofp <8 x i32> %a to <8 x double> ret <8 x double> %b } define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { ; KNL-LABEL: i32to8f64_mask: ; KNL: # %bb.0: ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; VLBW-LABEL: i32to8f64_mask: ; VLBW: # %bb.0: ; VLBW-NEXT: kmovd %edi, %k1 ; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} ; VLBW-NEXT: retq ; ; VLNOBW-LABEL: i32to8f64_mask: ; VLNOBW: # %bb.0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} ; VLNOBW-NEXT: retq ; ; AVX512DQ-LABEL: i32to8f64_mask: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: i32to8f64_mask: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq %1 = bitcast i8 %c to <8 x i1> %2 = sitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a ret <8 x double> %3 } define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { ; KNL-LABEL: sito8f64_maskz: ; KNL: # %bb.0: ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; VLBW-LABEL: sito8f64_maskz: ; VLBW: # %bb.0: ; VLBW-NEXT: kmovd %edi, %k1 ; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} ; VLBW-NEXT: retq ; ; VLNOBW-LABEL: sito8f64_maskz: ; VLNOBW: # %bb.0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} ; VLNOBW-NEXT: retq ; ; AVX512DQ-LABEL: sito8f64_maskz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: sito8f64_maskz: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq %1 = bitcast i8 %b to <8 x i1> %2 = sitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer ret <8 x double> %3 } define <8 x i32> @f64to8si(<8 x double> %a) { ; ALL-LABEL: f64to8si: ; ALL: # %bb.0: ; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0 ; ALL-NEXT: retq %b = fptosi <8 x double> %a to <8 x i32> ret <8 x i32> %b } define <4 x i32> @f64to4si(<4 x double> %a) { ; ALL-LABEL: f64to4si: ; ALL: # %bb.0: ; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %b = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %b } define <16 x float> @f64to16f32(<16 x double> %b) nounwind { ; ALL-LABEL: f64to16f32: ; ALL: # %bb.0: ; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0 ; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %a = fptrunc <16 x double> %b to <16 x float> ret <16 x float> %a } define <4 x float> @f64to4f32(<4 x double> %b) { ; ALL-LABEL: f64to4f32: ; ALL: # %bb.0: ; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> ret <4 x float> %a } define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) { ; NOVL-LABEL: f64to4f32_mask: ; NOVL: # %bb.0: ; NOVL-NEXT: vpslld $31, %xmm1, %xmm1 ; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1 ; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0 ; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f64to4f32_mask: ; VL: # %bb.0: ; VL-NEXT: vpslld $31, %xmm1, %xmm1 ; VL-NEXT: vptestmd %xmm1, %xmm1, %k1 ; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} ; VL-NEXT: vzeroupper ; VL-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer ret <4 x float> %c } define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { ; ALL-LABEL: f64tof32_inreg: ; ALL: # %bb.0: ; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 ; ALL-NEXT: retq %ext = extractelement <2 x double> %a0, i32 0 %cvt = fptrunc double %ext to float %res = insertelement <4 x float> %a1, float %cvt, i32 0 ret <4 x float> %res } define <8 x double> @f32to8f64(<8 x float> %b) nounwind { ; ALL-LABEL: f32to8f64: ; ALL: # %bb.0: ; ALL-NEXT: vcvtps2pd %ymm0, %zmm0 ; ALL-NEXT: retq %a = fpext <8 x float> %b to <8 x double> ret <8 x double> %a } define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) { ; NOVL-LABEL: f32to4f64_mask: ; NOVL: # %bb.0: ; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0 ; NOVL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1 ; NOVL-NEXT: vandpd %ymm0, %ymm1, %ymm0 ; NOVL-NEXT: retq ; ; VL-LABEL: f32to4f64_mask: ; VL: # %bb.0: ; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1 ; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} ; VL-NEXT: retq %a = fpext <4 x float> %b to <4 x double> %mask = fcmp ogt <4 x double> %a1, %b1 %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer ret <4 x double> %c } define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { ; ALL-LABEL: f32tof64_inreg: ; ALL: # %bb.0: ; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 ; ALL-NEXT: retq %ext = extractelement <4 x float> %a1, i32 0 %cvt = fpext float %ext to double %res = insertelement <2 x double> %a0, double %cvt, i32 0 ret <2 x double> %res } define double @sltof64_load(i64* nocapture %e) { ; ALL-LABEL: sltof64_load: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: %tmp1 = load i64, i64* %e, align 8 %conv = sitofp i64 %tmp1 to double ret double %conv } define double @sitof64_load(i32* %e) { ; ALL-LABEL: sitof64_load: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: %tmp1 = load i32, i32* %e, align 4 %conv = sitofp i32 %tmp1 to double ret double %conv } define float @sitof32_load(i32* %e) { ; ALL-LABEL: sitof32_load: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: %tmp1 = load i32, i32* %e, align 4 %conv = sitofp i32 %tmp1 to float ret float %conv } define float @sltof32_load(i64* %e) { ; ALL-LABEL: sltof32_load: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: %tmp1 = load i64, i64* %e, align 8 %conv = sitofp i64 %tmp1 to float ret float %conv } define void @f32tof64_loadstore() { ; ALL-LABEL: f32tof64_loadstore: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: retq entry: %f = alloca float, align 4 %d = alloca double, align 8 %tmp = load float, float* %f, align 4 %conv = fpext float %tmp to double store double %conv, double* %d, align 8 ret void } define void @f64tof32_loadstore() nounwind uwtable { ; ALL-LABEL: f64tof32_loadstore: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: retq entry: %f = alloca float, align 4 %d = alloca double, align 8 %tmp = load double, double* %d, align 8 %conv = fptrunc double %tmp to float store float %conv, float* %f, align 4 ret void } define double @long_to_double(i64 %x) { ; ALL-LABEL: long_to_double: ; ALL: # %bb.0: ; ALL-NEXT: vmovq %rdi, %xmm0 ; ALL-NEXT: retq %res = bitcast i64 %x to double ret double %res } define i64 @double_to_long(double %x) { ; ALL-LABEL: double_to_long: ; ALL: # %bb.0: ; ALL-NEXT: vmovq %xmm0, %rax ; ALL-NEXT: retq %res = bitcast double %x to i64 ret i64 %res } define float @int_to_float(i32 %x) { ; ALL-LABEL: int_to_float: ; ALL: # %bb.0: ; ALL-NEXT: vmovd %edi, %xmm0 ; ALL-NEXT: retq %res = bitcast i32 %x to float ret float %res } define i32 @float_to_int(float %x) { ; ALL-LABEL: float_to_int: ; ALL: # %bb.0: ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: retq %res = bitcast float %x to i32 ret i32 %res } define <16 x double> @uito16f64(<16 x i32> %a) nounwind { ; ALL-LABEL: uito16f64: ; ALL: # %bb.0: ; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1 ; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %b = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } define <8 x float> @slto8f32(<8 x i64> %a) { ; NODQ-LABEL: slto8f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; ; DQ-LABEL: slto8f32: ; DQ: # %bb.0: ; DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; DQ-NEXT: retq %b = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } define <16 x float> @slto16f32(<16 x i64> %a) { ; NODQ-LABEL: slto16f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq ; ; DQ-LABEL: slto16f32: ; DQ: # %bb.0: ; DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; DQ-NEXT: vcvtqq2ps %zmm1, %ymm1 ; DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; DQ-NEXT: retq %b = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %b } define <8 x double> @slto8f64(<8 x i64> %a) { ; NODQ-LABEL: slto8f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq ; ; DQ-LABEL: slto8f64: ; DQ: # %bb.0: ; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; DQ-NEXT: retq %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <16 x double> @slto16f64(<16 x i64> %a) { ; NODQ-LABEL: slto16f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1 ; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 ; NODQ-NEXT: retq ; ; DQ-LABEL: slto16f64: ; DQ: # %bb.0: ; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; DQ-NEXT: vcvtqq2pd %zmm1, %zmm1 ; DQ-NEXT: retq %b = sitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } define <8 x float> @ulto8f32(<8 x i64> %a) { ; NODQ-LABEL: ulto8f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; ; DQ-LABEL: ulto8f32: ; DQ: # %bb.0: ; DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; DQ-NEXT: retq %b = uitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } define <16 x float> @ulto16f32(<16 x i64> %a) { ; NODQ-LABEL: ulto16f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq ; ; DQ-LABEL: ulto16f32: ; DQ: # %bb.0: ; DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; DQ-NEXT: vcvtuqq2ps %zmm1, %ymm1 ; DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; DQ-NEXT: retq %b = uitofp <16 x i64> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { ; KNL-LABEL: uito8f64_mask: ; KNL: # %bb.0: ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} ; KNL-NEXT: retq ; ; VLBW-LABEL: uito8f64_mask: ; VLBW: # %bb.0: ; VLBW-NEXT: kmovd %edi, %k1 ; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} ; VLBW-NEXT: retq ; ; VLNOBW-LABEL: uito8f64_mask: ; VLNOBW: # %bb.0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} ; VLNOBW-NEXT: retq ; ; AVX512DQ-LABEL: uito8f64_mask: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: uito8f64_mask: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq %1 = bitcast i8 %c to <8 x i1> %2 = uitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a ret <8 x double> %3 } define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { ; KNL-LABEL: uito8f64_maskz: ; KNL: # %bb.0: ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; VLBW-LABEL: uito8f64_maskz: ; VLBW: # %bb.0: ; VLBW-NEXT: kmovd %edi, %k1 ; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} ; VLBW-NEXT: retq ; ; VLNOBW-LABEL: uito8f64_maskz: ; VLNOBW: # %bb.0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} ; VLNOBW-NEXT: retq ; ; AVX512DQ-LABEL: uito8f64_maskz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: uito8f64_maskz: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq %1 = bitcast i8 %b to <8 x i1> %2 = uitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer ret <8 x double> %3 } define <4 x double> @uito4f64(<4 x i32> %a) nounwind { ; NOVL-LABEL: uito4f64: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0 ; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 ; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; NOVL-NEXT: retq ; ; VL-LABEL: uito4f64: ; VL: # %bb.0: ; VL-NEXT: vcvtudq2pd %xmm0, %ymm0 ; VL-NEXT: retq %b = uitofp <4 x i32> %a to <4 x double> ret <4 x double> %b } define <16 x float> @uito16f32(<16 x i32> %a) nounwind { ; ALL-LABEL: uito16f32: ; ALL: # %bb.0: ; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %b = uitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uito8f64(<8 x i32> %a) { ; ALL-LABEL: uito8f64: ; ALL: # %bb.0: ; ALL-NEXT: vcvtudq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %b = uitofp <8 x i32> %a to <8 x double> ret <8 x double> %b } define <8 x float> @uito8f32(<8 x i32> %a) nounwind { ; NOVL-LABEL: uito8f32: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 ; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 ; NOVL-NEXT: retq ; ; VL-LABEL: uito8f32: ; VL: # %bb.0: ; VL-NEXT: vcvtudq2ps %ymm0, %ymm0 ; VL-NEXT: retq %b = uitofp <8 x i32> %a to <8 x float> ret <8 x float> %b } define <4 x float> @uito4f32(<4 x i32> %a) nounwind { ; NOVL-LABEL: uito4f32: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 ; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 ; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: uito4f32: ; VL: # %bb.0: ; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; VL-NEXT: retq %b = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %b } define i32 @fptosi(float %a) nounwind { ; ALL-LABEL: fptosi: ; ALL: # %bb.0: ; ALL-NEXT: vcvttss2si %xmm0, %eax ; ALL-NEXT: retq %b = fptosi float %a to i32 ret i32 %b } define i32 @fptoui(float %a) nounwind { ; ALL-LABEL: fptoui: ; ALL: # %bb.0: ; ALL-NEXT: vcvttss2usi %xmm0, %eax ; ALL-NEXT: retq %b = fptoui float %a to i32 ret i32 %b } define float @uitof32(i32 %a) nounwind { ; ALL-LABEL: uitof32: ; ALL: # %bb.0: ; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 ; ALL-NEXT: retq %b = uitofp i32 %a to float ret float %b } define double @uitof64(i32 %a) nounwind { ; ALL-LABEL: uitof64: ; ALL: # %bb.0: ; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 ; ALL-NEXT: retq %b = uitofp i32 %a to double ret double %b } define <16 x float> @sbto16f32(<16 x i32> %a) { ; NODQ-LABEL: sbto16f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 ; NODQ-NEXT: retq ; ; DQ-LABEL: sbto16f32: ; DQ: # %bb.0: ; DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ; DQ-NEXT: vpmovm2d %k0, %zmm0 ; DQ-NEXT: vcvtdq2ps %zmm0, %zmm0 ; DQ-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = sitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 } define <16 x float> @scto16f32(<16 x i8> %a) { ; ALL-LABEL: scto16f32: ; ALL: # %bb.0: ; ALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %1 = sitofp <16 x i8> %a to <16 x float> ret <16 x float> %1 } define <16 x float> @ssto16f32(<16 x i16> %a) { ; ALL-LABEL: ssto16f32: ; ALL: # %bb.0: ; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %1 = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %1 } define <8 x double> @ssto16f64(<8 x i16> %a) { ; ALL-LABEL: ssto16f64: ; ALL: # %bb.0: ; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 } define <8 x double> @scto8f64(<8 x i8> %a) { ; ALL-LABEL: scto8f64: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; ALL-NEXT: vpslld $24, %ymm0, %ymm0 ; ALL-NEXT: vpsrad $24, %ymm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %1 = sitofp <8 x i8> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @scto16f64(<16 x i8> %a) { ; ALL-LABEL: scto16f64: ; ALL: # %bb.0: ; ALL-NEXT: vpmovsxbd %xmm0, %zmm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; ALL-NEXT: retq %b = sitofp <16 x i8> %a to <16 x double> ret <16 x double> %b } define <16 x double> @sbto16f64(<16 x double> %a) { ; NOVLDQ-LABEL: sbto16f64: ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 ; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 ; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; NOVLDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: sbto16f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0 ; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1 ; VLDQ-NEXT: vpmovm2d %k1, %ymm0 ; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; VLDQ-NEXT: vpmovm2d %k0, %ymm1 ; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto16f64: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 ; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 ; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k2} {z} ; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; VLNODQ-NEXT: retq ; ; AVX512DQ-LABEL: sbto16f64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0 ; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 ; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm1 ; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <16 x double> %a, zeroinitializer %1 = sitofp <16 x i1> %cmpres to <16 x double> ret <16 x double> %1 } define <8 x double> @sbto8f64(<8 x double> %a) { ; NOVLDQ-LABEL: sbto8f64: ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 ; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: sbto8f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 ; VLDQ-NEXT: vpmovm2d %k0, %ymm0 ; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto8f64: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 ; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; VLNODQ-NEXT: retq ; ; AVX512DQ-LABEL: sbto8f64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <8 x double> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x double> ret <8 x double> %1 } define <8 x float> @sbto8f32(<8 x float> %a) { ; NOVLDQ-LABEL: sbto8f32: ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1 ; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 ; NOVLDQ-NEXT: retq ; ; VLDQ-LABEL: sbto8f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0 ; VLDQ-NEXT: vpmovm2d %k0, %ymm0 ; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto8f32: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1 ; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0 ; VLNODQ-NEXT: retq ; ; AVX512DQ-LABEL: sbto8f32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512DQ-NEXT: retq %cmpres = fcmp ogt <8 x float> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x float> ret <8 x float> %1 } define <4 x float> @sbto4f32(<4 x float> %a) { ; NOVL-LABEL: sbto4f32: ; NOVL: # %bb.0: ; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 ; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto4f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0 ; VLDQ-NEXT: vpmovm2d %k0, %xmm0 ; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto4f32: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VLNODQ-NEXT: retq %cmpres = fcmp ogt <4 x float> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x float> ret <4 x float> %1 } define <4 x double> @sbto4f64(<4 x double> %a) { ; NOVL-LABEL: sbto4f64: ; NOVL: # %bb.0: ; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; NOVL-NEXT: vpmovqd %zmm0, %ymm0 ; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto4f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0 ; VLDQ-NEXT: vpmovm2d %k0, %xmm0 ; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto4f64: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 ; VLNODQ-NEXT: retq %cmpres = fcmp ogt <4 x double> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x double> ret <4 x double> %1 } define <2 x float> @sbto2f32(<2 x float> %a) { ; NOVL-LABEL: sbto2f32: ; NOVL: # %bb.0: ; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0 ; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto2f32: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0 ; VLDQ-NEXT: vpmovm2d %k0, %xmm0 ; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto2f32: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VLNODQ-NEXT: retq %cmpres = fcmp ogt <2 x float> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x float> ret <2 x float> %1 } define <2 x double> @sbto2f64(<2 x double> %a) { ; NOVL-LABEL: sbto2f64: ; NOVL: # %bb.0: ; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 ; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 ; NOVL-NEXT: retq ; ; VLDQ-LABEL: sbto2f64: ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2q %k0, %xmm0 -; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto2f64: ; VLNODQ: # %bb.0: ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 ; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm0 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLNODQ-NEXT: retq %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> ret <2 x double> %1 } define <16 x float> @ucto16f32(<16 x i8> %a) { ; ALL-LABEL: ucto16f32: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %b = uitofp <16 x i8> %a to <16 x float> ret <16 x float>%b } define <8 x double> @ucto8f64(<8 x i8> %a) { ; ALL-LABEL: ucto8f64: ; ALL: # %bb.0: ; ALL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %b = uitofp <8 x i8> %a to <8 x double> ret <8 x double> %b } define <16 x float> @swto16f32(<16 x i16> %a) { ; ALL-LABEL: swto16f32: ; ALL: # %bb.0: ; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %b = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <8 x double> @swto8f64(<8 x i16> %a) { ; ALL-LABEL: swto8f64: ; ALL: # %bb.0: ; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %b = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %b } define <16 x double> @swto16f64(<16 x i16> %a) { ; ALL-LABEL: swto16f64: ; ALL: # %bb.0: ; ALL-NEXT: vpmovsxwd %ymm0, %zmm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; ALL-NEXT: retq %b = sitofp <16 x i16> %a to <16 x double> ret <16 x double> %b } define <16 x double> @ucto16f64(<16 x i8> %a) { ; ALL-LABEL: ucto16f64: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; ALL-NEXT: retq %b = uitofp <16 x i8> %a to <16 x double> ret <16 x double> %b } define <16 x float> @uwto16f32(<16 x i16> %a) { ; ALL-LABEL: uwto16f32: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %b = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uwto8f64(<8 x i16> %a) { ; ALL-LABEL: uwto8f64: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq %b = uitofp <8 x i16> %a to <8 x double> ret <8 x double> %b } define <16 x double> @uwto16f64(<16 x i16> %a) { ; ALL-LABEL: uwto16f64: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; ALL-NEXT: retq %b = uitofp <16 x i16> %a to <16 x double> ret <16 x double> %b } define <16 x float> @sito16f32(<16 x i32> %a) { ; ALL-LABEL: sito16f32: ; ALL: # %bb.0: ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %b = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <16 x double> @sito16f64(<16 x i32> %a) { ; ALL-LABEL: sito16f64: ; ALL: # %bb.0: ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm2 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1 ; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %b = sitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } define <16 x float> @usto16f32(<16 x i16> %a) { ; ALL-LABEL: usto16f32: ; ALL: # %bb.0: ; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %b = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <16 x float> @ubto16f32(<16 x i32> %a) { ; ALL-LABEL: ubto16f32: ; ALL: # %bb.0: ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 } define <16 x double> @ubto16f64(<16 x i32> %a) { ; NOVL-LABEL: ubto16f64: ; NOVL: # %bb.0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ; NOVL-NEXT: movl {{.*}}(%rip), %eax ; NOVL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} ; NOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; NOVL-NEXT: kshiftrw $8, %k1, %k1 ; NOVL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} ; NOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; NOVL-NEXT: retq ; ; VL-LABEL: ubto16f64: ; VL: # %bb.0: ; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ; VL-NEXT: movl {{.*}}(%rip), %eax ; VL-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} ; VL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; VL-NEXT: kshiftrw $8, %k1, %k1 ; VL-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} ; VL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; VL-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x double> ret <16 x double> %1 } define <8 x float> @ubto8f32(<8 x i32> %a) { ; NOVL-LABEL: ubto8f32: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; NOVL-NEXT: vcvtdq2ps %ymm0, %ymm0 ; NOVL-NEXT: retq ; ; VL-LABEL: ubto8f32: ; VL: # %bb.0: ; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 ; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} ; VL-NEXT: vcvtdq2ps %ymm0, %ymm0 ; VL-NEXT: retq %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x float> ret <8 x float> %1 } define <8 x double> @ubto8f64(<8 x i32> %a) { ; NOVL-LABEL: ubto8f64: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; NOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; NOVL-NEXT: retq ; ; VL-LABEL: ubto8f64: ; VL: # %bb.0: ; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 ; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} ; VL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; VL-NEXT: retq %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x double> ret <8 x double> %1 } define <4 x float> @ubto4f32(<4 x i32> %a) { ; NOVL-LABEL: ubto4f32: ; NOVL: # %bb.0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NOVL-NEXT: retq ; ; VL-LABEL: ubto4f32: ; VL: # %bb.0: ; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 ; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; VL-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VL-NEXT: retq %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x float> ret <4 x float> %1 } define <4 x double> @ubto4f64(<4 x i32> %a) { ; NOVL-LABEL: ubto4f64: ; NOVL: # %bb.0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NOVL-NEXT: vpsrld $31, %xmm0, %xmm0 ; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; NOVL-NEXT: retq ; ; VL-LABEL: ubto4f64: ; VL: # %bb.0: ; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 ; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; VL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; VL-NEXT: retq %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x double> ret <4 x double> %1 } define <2 x float> @ubto2f32(<2 x i32> %a) { ; NOVL-LABEL: ubto2f32: ; NOVL: # %bb.0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NOVL-NEXT: vpextrb $8, %xmm0, %eax ; NOVL-NEXT: andl $1, %eax ; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 ; NOVL-NEXT: vpextrb $0, %xmm0, %eax ; NOVL-NEXT: andl $1, %eax ; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 ; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; NOVL-NEXT: retq ; ; VL-LABEL: ubto2f32: ; VL: # %bb.0: ; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 ; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; VL-NEXT: vcvtdq2ps %xmm0, %xmm0 ; VL-NEXT: retq %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 } define <2 x double> @ubto2f64(<2 x i32> %a) { ; NOVL-LABEL: ubto2f64: ; NOVL: # %bb.0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; NOVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 +; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; -; VLDQ-LABEL: ubto2f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} -; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: ubto2f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm0 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; VLNODQ-NEXT: retq +; VL-LABEL: ubto2f64: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 } Index: vendor/llvm/dist/test/CodeGen/X86/avx512-schedule.ll =================================================================== --- vendor/llvm/dist/test/CodeGen/X86/avx512-schedule.ll (revision 327151) +++ vendor/llvm/dist/test/CodeGen/X86/avx512-schedule.ll (revision 327152) @@ -1,8860 +1,8860 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX ; This test is an assembly of avx512 instructions to check their scheduling define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) { ; GENERIC-LABEL: addpd512: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vaddpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: addpd512: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vaddpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %add.i = fadd <8 x double> %x, %y ret <8 x double> %add.i } define <8 x double> @addpd512fold(<8 x double> %y) { ; GENERIC-LABEL: addpd512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: addpd512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %add.i = fadd <8 x double> %y, ret <8 x double> %add.i } define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) { ; GENERIC-LABEL: addps512: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vaddps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: addps512: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vaddps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %add.i = fadd <16 x float> %x, %y ret <16 x float> %add.i } define <16 x float> @addps512fold(<16 x float> %y) { ; GENERIC-LABEL: addps512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: addps512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %add.i = fadd <16 x float> %y, ret <16 x float> %add.i } define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) { ; GENERIC-LABEL: subpd512: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vsubpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: subpd512: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vsubpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %sub.i = fsub <8 x double> %x, %y ret <8 x double> %sub.i } define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) { ; GENERIC-LABEL: subpd512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vsubpd (%rdi), %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: subpd512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vsubpd (%rdi), %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp2 = load <8 x double>, <8 x double>* %x, align 8 %sub.i = fsub <8 x double> %y, %tmp2 ret <8 x double> %sub.i } define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) { ; GENERIC-LABEL: subps512: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vsubps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: subps512: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vsubps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %sub.i = fsub <16 x float> %x, %y ret <16 x float> %sub.i } define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) { ; GENERIC-LABEL: subps512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vsubps (%rdi), %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: subps512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vsubps (%rdi), %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp2 = load <16 x float>, <16 x float>* %x, align 4 %sub.i = fsub <16 x float> %y, %tmp2 ret <16 x float> %sub.i } define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { ; GENERIC-LABEL: imulq512: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmullq %zmm0, %zmm1, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: imulq512: ; SKX: # %bb.0: ; SKX-NEXT: vpmullq %zmm0, %zmm1, %zmm0 # sched: [12:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %z = mul <8 x i64>%x, %y ret <8 x i64>%z } define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { ; GENERIC-LABEL: imulq256: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmullq %ymm0, %ymm1, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: imulq256: ; SKX: # %bb.0: ; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0 # sched: [12:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %z = mul <4 x i64>%x, %y ret <4 x i64>%z } define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { ; GENERIC-LABEL: imulq128: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmullq %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: imulq128: ; SKX: # %bb.0: ; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0 # sched: [12:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %z = mul <2 x i64>%x, %y ret <2 x i64>%z } define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) { ; GENERIC-LABEL: mulpd512: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vmulpd %zmm0, %zmm1, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mulpd512: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vmulpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %mul.i = fmul <8 x double> %x, %y ret <8 x double> %mul.i } define <8 x double> @mulpd512fold(<8 x double> %y) { ; GENERIC-LABEL: mulpd512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [9:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mulpd512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %mul.i = fmul <8 x double> %y, ret <8 x double> %mul.i } define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) { ; GENERIC-LABEL: mulps512: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mulps512: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %mul.i = fmul <16 x float> %x, %y ret <16 x float> %mul.i } define <16 x float> @mulps512fold(<16 x float> %y) { ; GENERIC-LABEL: mulps512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [9:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mulps512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %mul.i = fmul <16 x float> %y, ret <16 x float> %mul.i } define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) { ; GENERIC-LABEL: divpd512: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vdivpd %zmm0, %zmm1, %zmm0 # sched: [24:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: divpd512: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vdivpd %zmm0, %zmm1, %zmm0 # sched: [23:2.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %div.i = fdiv <8 x double> %x, %y ret <8 x double> %div.i } define <8 x double> @divpd512fold(<8 x double> %y) { ; GENERIC-LABEL: divpd512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [28:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: divpd512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [30:2.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %div.i = fdiv <8 x double> %y, ret <8 x double> %div.i } define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) { ; GENERIC-LABEL: divps512: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [24:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: divps512: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [23:2.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %div.i = fdiv <16 x float> %x, %y ret <16 x float> %div.i } define <16 x float> @divps512fold(<16 x float> %y) { ; GENERIC-LABEL: divps512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 # sched: [28:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: divps512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 # sched: [24:2.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %div.i = fdiv <16 x float> %y, ret <16 x float> %div.i } define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { ; GENERIC-LABEL: vpaddq_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpaddq %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddq_test: ; SKX: # %bb.0: ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = add <8 x i64> %i, %j ret <8 x i64> %x } define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind { ; GENERIC-LABEL: vpaddq_fold_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpaddq (%rdi), %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddq_fold_test: ; SKX: # %bb.0: ; SKX-NEXT: vpaddq (%rdi), %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %tmp = load <8 x i64>, <8 x i64>* %j, align 4 %x = add <8 x i64> %i, %tmp ret <8 x i64> %x } define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind { ; GENERIC-LABEL: vpaddq_broadcast_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddq_broadcast_test: ; SKX: # %bb.0: ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %x = add <8 x i64> %i, ret <8 x i64> %x } define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { ; GENERIC-LABEL: vpaddq_broadcast2_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddq_broadcast2_test: ; SKX: # %bb.0: ; SKX-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %tmp = load i64, i64* %j %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0 %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1 %j.2 = insertelement <8 x i64> %j.1, i64 %tmp, i32 2 %j.3 = insertelement <8 x i64> %j.2, i64 %tmp, i32 3 %j.4 = insertelement <8 x i64> %j.3, i64 %tmp, i32 4 %j.5 = insertelement <8 x i64> %j.4, i64 %tmp, i32 5 %j.6 = insertelement <8 x i64> %j.5, i64 %tmp, i32 6 %j.7 = insertelement <8 x i64> %j.6, i64 %tmp, i32 7 %x = add <8 x i64> %i, %j.7 ret <8 x i64> %x } define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { ; GENERIC-LABEL: vpaddd_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_test: ; SKX: # %bb.0: ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = add <16 x i32> %i, %j ret <16 x i32> %x } define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind { ; GENERIC-LABEL: vpaddd_fold_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_fold_test: ; SKX: # %bb.0: ; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %tmp = load <16 x i32>, <16 x i32>* %j, align 4 %x = add <16 x i32> %i, %tmp ret <16 x i32> %x } define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind { ; GENERIC-LABEL: vpaddd_broadcast_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_broadcast_test: ; SKX: # %bb.0: ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %x = add <16 x i32> %i, ret <16 x i32> %x } define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_mask_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_test: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i ret <16 x i32> %r } define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_maskz_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_test: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %r } define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_mask_fold_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_fold_test: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %j = load <16 x i32>, <16 x i32>* %j.ptr %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i ret <16 x i32> %r } define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_mask_broadcast_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_mask_broadcast_test: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i ret <16 x i32> %r } define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_maskz_fold_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_fold_test: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %j = load <16 x i32>, <16 x i32>* %j.ptr %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %r } define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: vpaddd_maskz_broadcast_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpaddd_maskz_broadcast_test: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %r } define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { ; GENERIC-LABEL: vpsubq_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsubq %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpsubq_test: ; SKX: # %bb.0: ; SKX-NEXT: vpsubq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = sub <8 x i64> %i, %j ret <8 x i64> %x } define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { ; GENERIC-LABEL: vpsubd_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsubd %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpsubd_test: ; SKX: # %bb.0: ; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = sub <16 x i32> %i, %j ret <16 x i32> %x } define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) { ; GENERIC-LABEL: vpmulld_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmulld %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpmulld_test: ; SKX: # %bb.0: ; SKX-NEXT: vpmulld %zmm1, %zmm0, %zmm0 # sched: [8:0.67] ; SKX-NEXT: retq # sched: [7:1.00] %x = mul <16 x i32> %i, %j ret <16 x i32> %x } declare float @sqrtf(float) readnone define float @sqrtA(float %a) nounwind uwtable readnone ssp { ; GENERIC-LABEL: sqrtA: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sqrtA: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %conv1 = tail call float @sqrtf(float %a) nounwind readnone ret float %conv1 } declare double @sqrt(double) readnone define double @sqrtB(double %a) nounwind uwtable readnone ssp { ; GENERIC-LABEL: sqrtB: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sqrtB: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %call = tail call double @sqrt(double %a) nounwind readnone ret double %call } declare float @llvm.sqrt.f32(float) define float @sqrtC(float %a) nounwind { ; GENERIC-LABEL: sqrtC: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sqrtC: ; SKX: # %bb.0: ; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = call float @llvm.sqrt.f32(float %a) ret float %b } declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) define <16 x float> @sqrtD(<16 x float> %a) nounwind { ; GENERIC-LABEL: sqrtD: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vsqrtps %zmm0, %zmm0 # sched: [14:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sqrtD: ; SKX: # %bb.0: ; SKX-NEXT: vsqrtps %zmm0, %zmm0 # sched: [19:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a) ret <16 x float> %b } declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) define <8 x double> @sqrtE(<8 x double> %a) nounwind { ; GENERIC-LABEL: sqrtE: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vsqrtpd %zmm0, %zmm0 # sched: [14:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sqrtE: ; SKX: # %bb.0: ; SKX-NEXT: vsqrtpd %zmm0, %zmm0 # sched: [31:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a) ret <8 x double> %b } define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind { ; GENERIC-LABEL: fadd_broadcast: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: fadd_broadcast: ; SKX: # %bb.0: ; SKX-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %b = fadd <16 x float> %a, ret <16 x float> %b } define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind { ; GENERIC-LABEL: addq_broadcast: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: addq_broadcast: ; SKX: # %bb.0: ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %b = add <8 x i64> %a, ret <8 x i64> %b } define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { ; GENERIC-LABEL: orq_broadcast: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: orq_broadcast: ; SKX: # %bb.0: ; SKX-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %b = or <8 x i64> %a, ret <8 x i64> %b } define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { ; GENERIC-LABEL: andd512fold: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vandps (%rdi), %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: andd512fold: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vandps (%rdi), %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %a = load <16 x i32>, <16 x i32>* %x, align 4 %b = and <16 x i32> %y, %a ret <16 x i32> %b } define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) { ; GENERIC-LABEL: andqbrst: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: andqbrst: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %a = load i64, i64* %ap, align 8 %b = insertelement <8 x i64> undef, i64 %a, i32 0 %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer %d = and <8 x i64> %p1, %c ret <8 x i64>%d } define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, ; GENERIC-LABEL: test_mask_vaddps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vaddps: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] <16 x float> %j, <16 x i32> %mask1) nounwind readnone { %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = fadd <16 x float> %i, %j %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst ret <16 x float> %r } define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vmulps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vmulps: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = fmul <16 x float> %i, %j %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst ret <16 x float> %r } define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vminps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vminps: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %cmp_res = fcmp olt <16 x float> %i, %j %min = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j %r = select <16 x i1> %mask, <16 x float> %min, <16 x float> %dst ret <16 x float> %r } define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vminpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vminpd: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %cmp_res = fcmp olt <8 x double> %i, %j %min = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j %r = select <8 x i1> %mask, <8 x double> %min, <8 x double> %dst ret <8 x double> %r } define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vmaxps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vmaxps: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %cmp_res = fcmp ogt <16 x float> %i, %j %max = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j %r = select <16 x i1> %mask, <16 x float> %max, <16 x float> %dst ret <16 x float> %r } define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vmaxpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vmaxpd: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %cmp_res = fcmp ogt <8 x double> %i, %j %max = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j %r = select <8 x i1> %mask, <8 x double> %max, <8 x double> %dst ret <8 x double> %r } define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vsubps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vsubps: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = fsub <16 x float> %i, %j %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst ret <16 x float> %r } define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vdivps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [24:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vdivps: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [23:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = fdiv <16 x float> %i, %j %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst ret <16 x float> %r } define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone { ; GENERIC-LABEL: test_mask_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_vaddpd: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %x = fadd <8 x double> %i, %j %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst ret <8 x double> %r } define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone { ; GENERIC-LABEL: test_maskz_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_maskz_vaddpd: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %x = fadd <8 x double> %i, %j %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer ret <8 x double> %r } define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind { ; GENERIC-LABEL: test_mask_fold_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_fold_vaddpd: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load <8 x double>, <8 x double>* %j, align 8 %x = fadd <8 x double> %i, %tmp %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst ret <8 x double> %r } define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind { ; GENERIC-LABEL: test_maskz_fold_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_maskz_fold_vaddpd: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load <8 x double>, <8 x double>* %j, align 8 %x = fadd <8 x double> %i, %tmp %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer ret <8 x double> %r } define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind { ; GENERIC-LABEL: test_broadcast_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_broadcast_vaddpd: ; SKX: # %bb.0: ; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %tmp = load double, double* %j %b = insertelement <8 x double> undef, double %tmp, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer %x = fadd <8 x double> %c, %i ret <8 x double> %x } define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, double* %j, <8 x i64> %mask1) nounwind { ; GENERIC-LABEL: test_mask_broadcast_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mask_broadcast_vaddpd: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load double, double* %j %b = insertelement <8 x double> undef, double %tmp, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer %x = fadd <8 x double> %c, %i %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %i ret <8 x double> %r } define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j, ; GENERIC-LABEL: test_maskz_broadcast_vaddpd: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_maskz_broadcast_vaddpd: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] <8 x i64> %mask1) nounwind { %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load double, double* %j %b = insertelement <8 x double> undef, double %tmp, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer %x = fadd <8 x double> %c, %i %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer ret <8 x double> %r } define <16 x float> @test_fxor(<16 x float> %a) { ; GENERIC-LABEL: test_fxor: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_fxor: ; SKX: # %bb.0: ; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %res = fsub <16 x float> , %a ret <16 x float>%res } define <8 x float> @test_fxor_8f32(<8 x float> %a) { ; GENERIC-LABEL: test_fxor_8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_fxor_8f32: ; SKX: # %bb.0: ; SKX-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %res = fsub <8 x float> , %a ret <8 x float>%res } define <8 x double> @fabs_v8f64(<8 x double> %p) ; GENERIC-LABEL: fabs_v8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: fabs_v8f64: ; SKX: # %bb.0: ; SKX-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] { %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p) ret <8 x double> %t } declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p) define <16 x float> @fabs_v16f32(<16 x float> %p) ; GENERIC-LABEL: fabs_v16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: fabs_v16f32: ; SKX: # %bb.0: ; SKX-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] { %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p) ret <16 x float> %t } declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p) define double @test1(double %a, double %b) nounwind { ; GENERIC-LABEL: test1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] ; GENERIC-NEXT: jne .LBB64_1 # sched: [1:1.00] ; GENERIC-NEXT: jnp .LBB64_2 # sched: [1:1.00] ; GENERIC-NEXT: .LBB64_1: # %l1 ; GENERIC-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB64_2: # %l2 ; GENERIC-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test1: ; SKX: # %bb.0: ; SKX-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] ; SKX-NEXT: jne .LBB64_1 # sched: [1:0.50] ; SKX-NEXT: jnp .LBB64_2 # sched: [1:0.50] ; SKX-NEXT: .LBB64_1: # %l1 ; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; SKX-NEXT: .LBB64_2: # %l2 ; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %tobool = fcmp une double %a, %b br i1 %tobool, label %l1, label %l2 l1: %c = fsub double %a, %b ret double %c l2: %c1 = fadd double %a, %b ret double %c1 } define float @test2(float %a, float %b) nounwind { ; GENERIC-LABEL: test2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vucomiss %xmm0, %xmm1 # sched: [2:1.00] ; GENERIC-NEXT: jbe .LBB65_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %l1 ; GENERIC-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB65_2: # %l2 ; GENERIC-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2: ; SKX: # %bb.0: ; SKX-NEXT: vucomiss %xmm0, %xmm1 # sched: [2:1.00] ; SKX-NEXT: jbe .LBB65_2 # sched: [1:0.50] ; SKX-NEXT: # %bb.1: # %l1 ; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; SKX-NEXT: .LBB65_2: # %l2 ; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %tobool = fcmp olt float %a, %b br i1 %tobool, label %l1, label %l2 l1: %c = fsub float %a, %b ret float %c l2: %c1 = fadd float %a, %b ret float %c1 } define i32 @test3(float %a, float %b) { ; GENERIC-LABEL: test3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test3: ; SKX: # %bb.0: ; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: movzbl %al, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %cmp10.i = fcmp oeq float %a, %b %conv11.i = zext i1 %cmp10.i to i32 ret i32 %conv11.i } define float @test5(float %p) #0 { ; GENERIC-LABEL: test5: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vucomiss %xmm1, %xmm0 # sched: [2:1.00] ; GENERIC-NEXT: jne .LBB67_1 # sched: [1:1.00] ; GENERIC-NEXT: jp .LBB67_1 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.2: # %return ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB67_1: # %if.end ; GENERIC-NEXT: seta %al # sched: [2:1.00] ; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33] ; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test5: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vucomiss %xmm1, %xmm0 # sched: [2:1.00] ; SKX-NEXT: jne .LBB67_1 # sched: [1:0.50] ; SKX-NEXT: jp .LBB67_1 # sched: [1:0.50] ; SKX-NEXT: # %bb.2: # %return ; SKX-NEXT: retq # sched: [7:1.00] ; SKX-NEXT: .LBB67_1: # %if.end ; SKX-NEXT: seta %al # sched: [2:1.00] ; SKX-NEXT: movzbl %al, %eax # sched: [1:0.25] ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %cmp = fcmp oeq float %p, 0.000000e+00 br i1 %cmp, label %return, label %if.end if.end: ; preds = %entry %cmp1 = fcmp ogt float %p, 0.000000e+00 %cond = select i1 %cmp1, float 1.000000e+00, float -1.000000e+00 br label %return return: ; preds = %if.end, %entry %retval.0 = phi float [ %cond, %if.end ], [ %p, %entry ] ret float %retval.0 } define i32 @test6(i32 %a, i32 %b) { ; GENERIC-LABEL: test6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33] ; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] ; GENERIC-NEXT: sete %al # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test6: ; SKX: # %bb.0: ; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25] ; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] ; SKX-NEXT: sete %al # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %cmp = icmp eq i32 %a, %b %res = zext i1 %cmp to i32 ret i32 %res } define i32 @test7(double %x, double %y) #2 { ; GENERIC-LABEL: test7: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33] ; GENERIC-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] ; GENERIC-NEXT: setne %al # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test7: ; SKX: # %bb.0: # %entry ; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25] ; SKX-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] ; SKX-NEXT: setne %al # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = fcmp one double %x, %y %or = zext i1 %0 to i32 ret i32 %or } define i32 @test8(i32 %a1, i32 %a2, i32 %a3) { ; GENERIC-LABEL: test8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 ; GENERIC-NEXT: # sched: [1:0.33] ; GENERIC-NEXT: testl %edx, %edx # sched: [1:0.33] ; GENERIC-NEXT: movl $1, %eax # sched: [1:0.33] ; GENERIC-NEXT: cmovel %eax, %edx # sched: [2:0.67] ; GENERIC-NEXT: notl %edi # sched: [1:0.33] ; GENERIC-NEXT: orl %edi, %esi # sched: [1:0.33] ; GENERIC-NEXT: cmovnel %edx, %eax # sched: [2:0.67] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test8: ; SKX: # %bb.0: ; SKX-NEXT: notl %edi # sched: [1:0.25] ; SKX-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 ; SKX-NEXT: # sched: [1:0.25] ; SKX-NEXT: testl %edx, %edx # sched: [1:0.25] ; SKX-NEXT: movl $1, %eax # sched: [1:0.25] ; SKX-NEXT: cmovel %eax, %edx # sched: [1:0.50] ; SKX-NEXT: orl %edi, %esi # sched: [1:0.25] ; SKX-NEXT: cmovnel %edx, %eax # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %tmp1 = icmp eq i32 %a1, -1 %tmp2 = icmp eq i32 %a2, -2147483648 %tmp3 = and i1 %tmp1, %tmp2 %tmp4 = icmp eq i32 %a3, 0 %tmp5 = or i1 %tmp3, %tmp4 %res = select i1 %tmp5, i32 1, i32 %a3 ret i32 %res } define i32 @test9(i64 %a) { ; GENERIC-LABEL: test9: ; GENERIC: # %bb.0: ; GENERIC-NEXT: testb $1, %dil # sched: [1:0.33] ; GENERIC-NEXT: jne .LBB71_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %A ; GENERIC-NEXT: movl $6, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB71_2: # %B ; GENERIC-NEXT: movl $7, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test9: ; SKX: # %bb.0: ; SKX-NEXT: testb $1, %dil # sched: [1:0.25] ; SKX-NEXT: jne .LBB71_2 # sched: [1:0.50] ; SKX-NEXT: # %bb.1: # %A ; SKX-NEXT: movl $6, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] ; SKX-NEXT: .LBB71_2: # %B ; SKX-NEXT: movl $7, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = and i64 %a, 1 %cmp10.i = icmp eq i64 %b, 0 br i1 %cmp10.i, label %A, label %B A: ret i32 6 B: ret i32 7 } define i32 @test10(i64 %b, i64 %c, i1 %d) { ; GENERIC-LABEL: test10: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movl %edx, %eax # sched: [1:0.33] ; GENERIC-NEXT: andb $1, %al # sched: [1:0.33] ; GENERIC-NEXT: cmpq %rsi, %rdi # sched: [1:0.33] ; GENERIC-NEXT: sete %cl # sched: [1:0.50] ; GENERIC-NEXT: orb %dl, %cl # sched: [1:0.33] ; GENERIC-NEXT: andb $1, %cl # sched: [1:0.33] ; GENERIC-NEXT: cmpb %cl, %al # sched: [1:0.33] ; GENERIC-NEXT: je .LBB72_1 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.2: # %if.end.i ; GENERIC-NEXT: movl $6, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB72_1: # %if.then.i ; GENERIC-NEXT: movl $5, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test10: ; SKX: # %bb.0: ; SKX-NEXT: movl %edx, %eax # sched: [1:0.25] ; SKX-NEXT: andb $1, %al # sched: [1:0.25] ; SKX-NEXT: cmpq %rsi, %rdi # sched: [1:0.25] ; SKX-NEXT: sete %cl # sched: [1:0.50] ; SKX-NEXT: orb %dl, %cl # sched: [1:0.25] ; SKX-NEXT: andb $1, %cl # sched: [1:0.25] ; SKX-NEXT: cmpb %cl, %al # sched: [1:0.25] ; SKX-NEXT: je .LBB72_1 # sched: [1:0.50] ; SKX-NEXT: # %bb.2: # %if.end.i ; SKX-NEXT: movl $6, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] ; SKX-NEXT: .LBB72_1: # %if.then.i ; SKX-NEXT: movl $5, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %cmp8.i = icmp eq i64 %b, %c %or1 = or i1 %d, %cmp8.i %xor1 = xor i1 %d, %or1 br i1 %xor1, label %if.end.i, label %if.then.i if.then.i: ret i32 5 if.end.i: ret i32 6 } define <16 x float> @sitof32(<16 x i32> %a) nounwind { ; GENERIC-LABEL: sitof32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sitof32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <8 x double> @sltof864(<8 x i64> %a) { ; GENERIC-LABEL: sltof864: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sltof864: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <4 x double> @slto4f64(<4 x i64> %a) { ; GENERIC-LABEL: slto4f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2pd %ymm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: slto4f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2pd %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <4 x i64> %a to <4 x double> ret <4 x double> %b } define <2 x double> @slto2f64(<2 x i64> %a) { ; GENERIC-LABEL: slto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: slto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <2 x i64> %a to <2 x double> ret <2 x double> %b } define <2 x float> @sltof2f32(<2 x i64> %a) { ; GENERIC-LABEL: sltof2f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2ps %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sltof2f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2ps %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <2 x i64> %a to <2 x float> ret <2 x float>%b } define <4 x float> @slto4f32_mem(<4 x i64>* %a) { ; GENERIC-LABEL: slto4f32_mem: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2psy (%rdi), %xmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: slto4f32_mem: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2psy (%rdi), %xmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a1 = load <4 x i64>, <4 x i64>* %a, align 8 %b = sitofp <4 x i64> %a1 to <4 x float> ret <4 x float>%b } define <4 x i64> @f64to4sl(<4 x double> %a) { ; GENERIC-LABEL: f64to4sl: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2qq %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to4sl: ; SKX: # %bb.0: ; SKX-NEXT: vcvttpd2qq %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <4 x double> %a to <4 x i64> ret <4 x i64> %b } define <4 x i64> @f32to4sl(<4 x float> %a) { ; GENERIC-LABEL: f32to4sl: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2qq %xmm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to4sl: ; SKX: # %bb.0: ; SKX-NEXT: vcvttps2qq %xmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <4 x float> %a to <4 x i64> ret <4 x i64> %b } define <4 x float> @slto4f32(<4 x i64> %a) { ; GENERIC-LABEL: slto4f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2ps %ymm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: slto4f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0 # sched: [7:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <4 x float> @ulto4f32(<4 x i64> %a) { ; GENERIC-LABEL: ulto4f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtuqq2ps %ymm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ulto4f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtuqq2ps %ymm0, %xmm0 # sched: [7:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <8 x double> @ulto8f64(<8 x i64> %a) { ; GENERIC-LABEL: ulto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ulto8f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <16 x double> @ulto16f64(<16 x i64> %a) { ; GENERIC-LABEL: ulto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vcvtuqq2pd %zmm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ulto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vcvtuqq2pd %zmm1, %zmm1 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } define <16 x i32> @f64to16si(<16 x float> %a) nounwind { ; GENERIC-LABEL: f64to16si: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to16si: ; SKX: # %bb.0: ; SKX-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <16 x float> %a to <16 x i32> ret <16 x i32> %b } define <16 x i32> @f32to16ui(<16 x float> %a) nounwind { ; GENERIC-LABEL: f32to16ui: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to16ui: ; SKX: # %bb.0: ; SKX-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <16 x float> %a to <16 x i32> ret <16 x i32> %b } define <16 x i8> @f32to16uc(<16 x float> %f) { ; GENERIC-LABEL: f32to16uc: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovdb %zmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to16uc: ; SKX: # %bb.0: ; SKX-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vpmovdb %zmm0, %xmm0 # sched: [4:2.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = fptoui <16 x float> %f to <16 x i8> ret <16 x i8> %res } define <16 x i16> @f32to16us(<16 x float> %f) { ; GENERIC-LABEL: f32to16us: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovdw %zmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to16us: ; SKX: # %bb.0: ; SKX-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vpmovdw %zmm0, %ymm0 # sched: [4:2.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = fptoui <16 x float> %f to <16 x i16> ret <16 x i16> %res } define <8 x i32> @f32to8ui(<8 x float> %a) nounwind { ; GENERIC-LABEL: f32to8ui: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2udq %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to8ui: ; SKX: # %bb.0: ; SKX-NEXT: vcvttps2udq %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <8 x float> %a to <8 x i32> ret <8 x i32> %b } define <4 x i32> @f32to4ui(<4 x float> %a) nounwind { ; GENERIC-LABEL: f32to4ui: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttps2udq %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to4ui: ; SKX: # %bb.0: ; SKX-NEXT: vcvttps2udq %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <4 x float> %a to <4 x i32> ret <4 x i32> %b } define <8 x i32> @f64to8ui(<8 x double> %a) nounwind { ; GENERIC-LABEL: f64to8ui: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2udq %zmm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to8ui: ; SKX: # %bb.0: ; SKX-NEXT: vcvttpd2udq %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <8 x double> %a to <8 x i32> ret <8 x i32> %b } define <8 x i16> @f64to8us(<8 x double> %f) { ; GENERIC-LABEL: f64to8us: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to8us: ; SKX: # %bb.0: ; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: vpmovdw %ymm0, %xmm0 # sched: [4:2.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = fptoui <8 x double> %f to <8 x i16> ret <8 x i16> %res } define <8 x i8> @f64to8uc(<8 x double> %f) { ; GENERIC-LABEL: f64to8uc: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to8uc: ; SKX: # %bb.0: ; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: vpmovdw %ymm0, %xmm0 # sched: [4:2.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = fptoui <8 x double> %f to <8 x i8> ret <8 x i8> %res } define <4 x i32> @f64to4ui(<4 x double> %a) nounwind { ; GENERIC-LABEL: f64to4ui: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2udq %ymm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to4ui: ; SKX: # %bb.0: ; SKX-NEXT: vcvttpd2udq %ymm0, %xmm0 # sched: [7:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %b } define <8 x double> @sito8f64(<8 x i32> %a) { ; GENERIC-LABEL: sito8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sito8f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i32> %a to <8 x double> ret <8 x double> %b } define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { ; GENERIC-LABEL: i32to8f64_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: i32to8f64_mask: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] ; VLNOBW-LABEL: i32to8f64_mask: ; VLNOBW: # %bb.0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} ; VLNOBW-NEXT: ret{{[l|q]}} %1 = bitcast i8 %c to <8 x i1> %2 = sitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a ret <8 x double> %3 } define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { ; GENERIC-LABEL: sito8f64_maskz: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sito8f64_maskz: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] ; VLNOBW-LABEL: sito8f64_maskz: ; VLNOBW: # %bb.0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} ; VLNOBW-NEXT: ret{{[l|q]}} %1 = bitcast i8 %b to <8 x i1> %2 = sitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer ret <8 x double> %3 } define <8 x i32> @f64to8si(<8 x double> %a) { ; GENERIC-LABEL: f64to8si: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to8si: ; SKX: # %bb.0: ; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <8 x double> %a to <8 x i32> ret <8 x i32> %b } define <4 x i32> @f64to4si(<4 x double> %a) { ; GENERIC-LABEL: f64to4si: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to4si: ; SKX: # %bb.0: ; SKX-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %b } define <16 x float> @f64to16f32(<16 x double> %b) nounwind { ; GENERIC-LABEL: f64to16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: vcvtpd2ps %zmm1, %ymm1 # sched: [3:1.00] ; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to16f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: vcvtpd2ps %zmm1, %ymm1 # sched: [7:1.00] ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = fptrunc <16 x double> %b to <16 x float> ret <16 x float> %a } define <4 x float> @f64to4f32(<4 x double> %b) { ; GENERIC-LABEL: f64to4f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to4f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = fptrunc <4 x double> %b to <4 x float> ret <4 x float> %a } define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) { ; GENERIC-LABEL: f64to4f32_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64to4f32_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [7:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = fptrunc <4 x double> %b to <4 x float> %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer ret <4 x float> %c } define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { ; GENERIC-LABEL: f64tof32_inreg: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64tof32_inreg: ; SKX: # %bb.0: ; SKX-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %ext = extractelement <2 x double> %a0, i32 0 %cvt = fptrunc double %ext to float %res = insertelement <4 x float> %a1, float %cvt, i32 0 ret <4 x float> %res } define <8 x double> @f32to8f64(<8 x float> %b) nounwind { ; GENERIC-LABEL: f32to8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to8f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = fpext <8 x float> %b to <8 x double> ret <8 x double> %a } define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) { ; GENERIC-LABEL: f32to4f64_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcmpltpd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32to4f64_mask: ; SKX: # %bb.0: ; SKX-NEXT: vcmpltpd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = fpext <4 x float> %b to <4 x double> %mask = fcmp ogt <4 x double> %a1, %b1 %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer ret <4 x double> %c } define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { ; GENERIC-LABEL: f32tof64_inreg: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32tof64_inreg: ; SKX: # %bb.0: ; SKX-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %ext = extractelement <4 x float> %a1, i32 0 %cvt = fpext float %ext to double %res = insertelement <2 x double> %a0, double %cvt, i32 0 ret <2 x double> %res } define double @sltof64_load(i64* nocapture %e) { ; GENERIC-LABEL: sltof64_load: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sltof64_load: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp1 = load i64, i64* %e, align 8 %conv = sitofp i64 %tmp1 to double ret double %conv } define double @sitof64_load(i32* %e) { ; GENERIC-LABEL: sitof64_load: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 # sched: [9:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sitof64_load: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp1 = load i32, i32* %e, align 4 %conv = sitofp i32 %tmp1 to double ret double %conv } define float @sitof32_load(i32* %e) { ; GENERIC-LABEL: sitof32_load: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sitof32_load: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp1 = load i32, i32* %e, align 4 %conv = sitofp i32 %tmp1 to float ret float %conv } define float @sltof32_load(i64* %e) { ; GENERIC-LABEL: sltof32_load: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 # sched: [10:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sltof32_load: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp1 = load i64, i64* %e, align 8 %conv = sitofp i64 %tmp1 to float ret float %conv } define void @f32tof64_loadstore() { ; GENERIC-LABEL: f32tof64_loadstore: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] ; GENERIC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f32tof64_loadstore: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %f = alloca float, align 4 %d = alloca double, align 8 %tmp = load float, float* %f, align 4 %conv = fpext float %tmp to double store double %conv, double* %d, align 8 ret void } define void @f64tof32_loadstore() nounwind uwtable { ; GENERIC-LABEL: f64tof32_loadstore: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50] ; GENERIC-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: f64tof32_loadstore: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50] ; SKX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %f = alloca float, align 4 %d = alloca double, align 8 %tmp = load double, double* %d, align 8 %conv = fptrunc double %tmp to float store float %conv, float* %f, align 4 ret void } define double @long_to_double(i64 %x) { ; GENERIC-LABEL: long_to_double: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: long_to_double: ; SKX: # %bb.0: ; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast i64 %x to double ret double %res } define i64 @double_to_long(double %x) { ; GENERIC-LABEL: double_to_long: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovq %xmm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: double_to_long: ; SKX: # %bb.0: ; SKX-NEXT: vmovq %xmm0, %rax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast double %x to i64 ret i64 %res } define float @int_to_float(i32 %x) { ; GENERIC-LABEL: int_to_float: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: int_to_float: ; SKX: # %bb.0: ; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast i32 %x to float ret float %res } define i32 @float_to_int(float %x) { ; GENERIC-LABEL: float_to_int: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: float_to_int: ; SKX: # %bb.0: ; SKX-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast float %x to i32 ret i32 %res } define <16 x double> @uito16f64(<16 x i32> %a) nounwind { ; GENERIC-LABEL: uito16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm2 # sched: [4:1.00] ; GENERIC-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito16f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtudq2pd %ymm0, %zmm2 # sched: [7:1.00] ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [3:1.00] ; SKX-NEXT: vcvtudq2pd %ymm0, %zmm1 # sched: [7:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } define <8 x float> @slto8f32(<8 x i64> %a) { ; GENERIC-LABEL: slto8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: slto8f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } define <16 x float> @slto16f32(<16 x i64> %a) { ; GENERIC-LABEL: slto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: vcvtqq2ps %zmm1, %ymm1 # sched: [4:1.00] ; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: slto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: vcvtqq2ps %zmm1, %ymm1 # sched: [7:1.00] ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %b } define <8 x double> @slto8f64(<8 x i64> %a) { ; GENERIC-LABEL: slto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: slto8f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <16 x double> @slto16f64(<16 x i64> %a) { ; GENERIC-LABEL: slto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vcvtqq2pd %zmm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: slto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vcvtqq2pd %zmm1, %zmm1 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } define <8 x float> @ulto8f32(<8 x i64> %a) { ; GENERIC-LABEL: ulto8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ulto8f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } define <16 x float> @ulto16f32(<16 x i64> %a) { ; GENERIC-LABEL: ulto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: vcvtuqq2ps %zmm1, %ymm1 # sched: [4:1.00] ; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ulto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: vcvtuqq2ps %zmm1, %ymm1 # sched: [7:1.00] ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i64> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { ; GENERIC-LABEL: uito8f64_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito8f64_mask: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] ; VLNOBW-LABEL: uito8f64_mask: ; VLNOBW: # %bb.0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} ; VLNOBW-NEXT: ret{{[l|q]}} %1 = bitcast i8 %c to <8 x i1> %2 = uitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a ret <8 x double> %3 } define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { ; GENERIC-LABEL: uito8f64_maskz: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito8f64_maskz: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = bitcast i8 %b to <8 x i1> %2 = uitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer ret <8 x double> %3 } define <4 x double> @uito4f64(<4 x i32> %a) nounwind { ; GENERIC-LABEL: uito4f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtudq2pd %xmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito4f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <4 x i32> %a to <4 x double> ret <4 x double> %b } define <16 x float> @uito16f32(<16 x i32> %a) nounwind { ; GENERIC-LABEL: uito16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtudq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito16f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtudq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uito8f64(<8 x i32> %a) { ; GENERIC-LABEL: uito8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito8f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i32> %a to <8 x double> ret <8 x double> %b } define <8 x float> @uito8f32(<8 x i32> %a) nounwind { ; GENERIC-LABEL: uito8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtudq2ps %ymm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito8f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i32> %a to <8 x float> ret <8 x float> %b } define <4 x float> @uito4f32(<4 x i32> %a) nounwind { ; GENERIC-LABEL: uito4f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uito4f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %b } define i32 @fptosi(float %a) nounwind { ; GENERIC-LABEL: fptosi: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttss2si %xmm0, %eax # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: fptosi: ; SKX: # %bb.0: ; SKX-NEXT: vcvttss2si %xmm0, %eax # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi float %a to i32 ret i32 %b } define i32 @fptoui(float %a) nounwind { ; GENERIC-LABEL: fptoui: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvttss2usi %xmm0, %eax # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: fptoui: ; SKX: # %bb.0: ; SKX-NEXT: vcvttss2usi %xmm0, %eax # sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui float %a to i32 ret i32 %b } define float @uitof32(i32 %a) nounwind { ; GENERIC-LABEL: uitof32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uitof32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp i32 %a to float ret float %b } define double @uitof64(i32 %a) nounwind { ; GENERIC-LABEL: uitof64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uitof64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp i32 %a to double ret double %b } define <16 x float> @sbto16f32(<16 x i32> %a) { ; GENERIC-LABEL: sbto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = sitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 } define <16 x float> @scto16f32(<16 x i8> %a) { ; GENERIC-LABEL: scto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: scto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %1 = sitofp <16 x i8> %a to <16 x float> ret <16 x float> %1 } define <16 x float> @ssto16f32(<16 x i16> %a) { ; GENERIC-LABEL: ssto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ssto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %1 = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %1 } define <8 x double> @ssto16f64(<8 x i16> %a) { ; GENERIC-LABEL: ssto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ssto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 } define <8 x double> @scto8f64(<8 x i8> %a) { ; GENERIC-LABEL: scto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] ; GENERIC-NEXT: vpslld $24, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpsrad $24, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: scto8f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] ; SKX-NEXT: vpslld $24, %ymm0, %ymm0 # sched: [1:0.50] ; SKX-NEXT: vpsrad $24, %ymm0, %ymm0 # sched: [1:0.50] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = sitofp <8 x i8> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @scto16f64(<16 x i8> %a) { ; GENERIC-LABEL: scto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: scto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbd %xmm0, %zmm1 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i8> %a to <16 x double> ret <16 x double> %b } define <16 x double> @sbto16f64(<16 x double> %a) { ; GENERIC-LABEL: sbto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltpd %zmm1, %zmm2, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vcmpltpd %zmm0, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k1, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0 # sched: [3:1.00] ; SKX-NEXT: vcmpltpd %zmm0, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <16 x double> %a, zeroinitializer %1 = sitofp <16 x i1> %cmpres to <16 x double> ret <16 x double> %1 } define <8 x double> @sbto8f64(<8 x double> %a) { ; GENERIC-LABEL: sbto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto8f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <8 x double> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x double> ret <8 x double> %1 } define <8 x float> @sbto8f32(<8 x float> %a) { ; GENERIC-LABEL: sbto8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltps %ymm0, %ymm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto8f32: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <8 x float> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x float> ret <8 x float> %1 } define <4 x float> @sbto4f32(<4 x float> %a) { ; GENERIC-LABEL: sbto4f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto4f32: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <4 x float> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x float> ret <4 x float> %1 } define <4 x double> @sbto4f64(<4 x double> %a) { ; GENERIC-LABEL: sbto4f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltpd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto4f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpltpd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <4 x double> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x double> ret <4 x double> %1 } define <2 x float> @sbto2f32(<2 x float> %a) { ; GENERIC-LABEL: sbto2f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto2f32: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x float> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x float> ret <2 x float> %1 } define <2 x double> @sbto2f64(<2 x double> %a) { ; GENERIC-LABEL: sbto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33] -; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00] +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25] -; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> ret <2 x double> %1 } define <16 x float> @ucto16f32(<16 x i8> %a) { ; GENERIC-LABEL: ucto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ucto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i8> %a to <16 x float> ret <16 x float>%b } define <8 x double> @ucto8f64(<8 x i8> %a) { ; GENERIC-LABEL: ucto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ucto8f64: ; SKX: # %bb.0: ; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i8> %a to <8 x double> ret <8 x double> %b } define <16 x float> @swto16f32(<16 x i16> %a) { ; GENERIC-LABEL: swto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: swto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <8 x double> @swto8f64(<8 x i16> %a) { ; GENERIC-LABEL: swto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: swto8f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %b } define <16 x double> @swto16f64(<16 x i16> %a) { ; GENERIC-LABEL: swto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: swto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwd %ymm0, %zmm1 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i16> %a to <16 x double> ret <16 x double> %b } define <16 x double> @ucto16f64(<16 x i8> %a) { ; GENERIC-LABEL: ucto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ucto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i8> %a to <16 x double> ret <16 x double> %b } define <16 x float> @uwto16f32(<16 x i16> %a) { ; GENERIC-LABEL: uwto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uwto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uwto8f64(<8 x i16> %a) { ; GENERIC-LABEL: uwto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uwto8f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i16> %a to <8 x double> ret <8 x double> %b } define <16 x double> @uwto16f64(<16 x i16> %a) { ; GENERIC-LABEL: uwto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: uwto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i16> %a to <16 x double> ret <16 x double> %b } define <16 x float> @sito16f32(<16 x i32> %a) { ; GENERIC-LABEL: sito16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sito16f32: ; SKX: # %bb.0: ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <16 x double> @sito16f64(<16 x i32> %a) { ; GENERIC-LABEL: sito16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm2 # sched: [4:1.00] ; GENERIC-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sito16f64: ; SKX: # %bb.0: ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm2 # sched: [7:1.00] ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm1 # sched: [7:1.00] ; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } define <16 x float> @usto16f32(<16 x i16> %a) { ; GENERIC-LABEL: usto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: usto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <16 x float> @ubto16f32(<16 x i32> %a) { ; GENERIC-LABEL: ubto16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto16f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 } define <16 x double> @ubto16f64(<16 x i32> %a) { ; GENERIC-LABEL: ubto16f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50] ; GENERIC-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: kshiftrw $8, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto16f64: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50] ; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: kshiftrw $8, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x double> ret <16 x double> %1 } define <8 x float> @ubto8f32(<8 x i32> %a) { ; GENERIC-LABEL: ubto8f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto8f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x float> ret <8 x float> %1 } define <8 x double> @ubto8f64(<8 x i32> %a) { ; GENERIC-LABEL: ubto8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto8f64: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x double> ret <8 x double> %1 } define <4 x float> @ubto4f32(<4 x i32> %a) { ; GENERIC-LABEL: ubto4f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto4f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] ; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x float> ret <4 x float> %1 } define <4 x double> @ubto4f64(<4 x i32> %a) { ; GENERIC-LABEL: ubto4f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto4f64: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] ; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x double> ret <4 x double> %1 } define <2 x float> @ubto2f32(<2 x i32> %a) { ; GENERIC-LABEL: ubto2f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] ; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] ; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] ; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 } define <2 x double> @ubto2f64(<2 x i32> %a) { ; GENERIC-LABEL: ubto2f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] ; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50] -; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00] +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00] +; GENERIC-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] ; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] -; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] +; SKX-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 } define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x8mem_to_8x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x8mem_to_8x16: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = zext <8 x i8> %a to <8 x i16> %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer ret <8 x i16> %ret } define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_8x8mem_to_8x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x8mem_to_8x16: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i16> %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer ret <8 x i16> %ret } define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_16x8mem_to_16x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x8mem_to_16x16: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = zext <16 x i8> %a to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_16x8mem_to_16x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x8mem_to_16x16: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = sext <16 x i8> %a to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { ; GENERIC-LABEL: zext_16x8_to_16x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x8_to_16x16: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i8> %a to <16 x i16> ret <16 x i16> %x } define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_16x8_to_16x16_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x8_to_16x16_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i8> %a to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { ; GENERIC-LABEL: sext_16x8_to_16x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x8_to_16x16: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = sext <16 x i8> %a to <16 x i16> ret <16 x i16> %x } define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_16x8_to_16x16_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x8_to_16x16_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = sext <16 x i8> %a to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_32x8mem_to_32x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32x8mem_to_32x16: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <32 x i8>,<32 x i8> *%i,align 1 %x = zext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_32x8mem_to_32x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_32x8mem_to_32x16: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <32 x i8>,<32 x i8> *%i,align 1 %x = sext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; GENERIC-LABEL: zext_32x8_to_32x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32x8_to_32x16: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <32 x i8> %a to <32 x i16> ret <32 x i16> %x } define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_32x8_to_32x16_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32x8_to_32x16_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; GENERIC-LABEL: sext_32x8_to_32x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_32x8_to_32x16: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = sext <32 x i8> %a to <32 x i16> ret <32 x i16> %x } define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_32x8_to_32x16_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_32x8_to_32x16_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = sext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_4x8mem_to_4x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x8mem_to_4x32: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = zext <4 x i8> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer ret <4 x i32> %ret } define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_4x8mem_to_4x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x8mem_to_4x32: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = sext <4 x i8> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer ret <4 x i32> %ret } define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x8mem_to_8x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x8mem_to_8x32: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = zext <8 x i8> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_8x8mem_to_8x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x8mem_to_8x32: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_16x8mem_to_16x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x8mem_to_16x32: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = zext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_16x8mem_to_16x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x8mem_to_16x32: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = sext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_16x8_to_16x32_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x8_to_16x32_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_16x8_to_16x32_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x8_to_16x32_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = sext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone { ; GENERIC-LABEL: zext_16x8_to_16x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x8_to_16x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i8> %i to <16 x i32> ret <16 x i32> %x } define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone { ; GENERIC-LABEL: sext_16x8_to_16x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x8_to_16x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = sext <16 x i8> %i to <16 x i32> ret <16 x i32> %x } define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_2x8mem_to_2x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_2x8mem_to_2x64: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = zext <2 x i8> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_2x8mem_to_2x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x8mem_to_2x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = sext <2 x i8> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone { ; GENERIC-LABEL: sext_2x8mem_to_2x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x8mem_to_2x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 # sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = sext <2 x i8> %a to <2 x i64> ret <2 x i64> %x } define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_4x8mem_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x8mem_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = zext <4 x i8> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_4x8mem_to_4x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x8mem_to_4x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = sext <4 x i8> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone { ; GENERIC-LABEL: sext_4x8mem_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x8mem_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 # sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = sext <4 x i8> %a to <4 x i64> ret <4 x i64> %x } define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x8mem_to_8x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x8mem_to_8x64: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = zext <8 x i8> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_8x8mem_to_8x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x8mem_to_8x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone { ; GENERIC-LABEL: sext_8x8mem_to_8x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x8mem_to_8x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i64> ret <8 x i64> %x } define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_4x16mem_to_4x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x16mem_to_4x32: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = zext <4 x i16> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer ret <4 x i32> %ret } define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_4x16mem_to_4x32mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x16mem_to_4x32mask: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer ret <4 x i32> %ret } define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone { ; GENERIC-LABEL: sext_4x16mem_to_4x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x16mem_to_4x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 # sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i32> ret <4 x i32> %x } define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x16mem_to_8x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x16mem_to_8x32: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = zext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_8x16mem_to_8x32mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x16mem_to_8x32mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone { ; GENERIC-LABEL: sext_8x16mem_to_8x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x16mem_to_8x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %x } define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x16_to_8x32mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x16_to_8x32mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone { ; GENERIC-LABEL: zext_8x16_to_8x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x16_to_8x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <8 x i16> %a to <8 x i32> ret <8 x i32> %x } define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_16x16mem_to_16x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x16mem_to_16x32: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = zext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_16x16mem_to_16x32mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x16mem_to_16x32mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = sext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone { ; GENERIC-LABEL: sext_16x16mem_to_16x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16x16mem_to_16x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = sext <16 x i16> %a to <16 x i32> ret <16 x i32> %x } define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_16x16_to_16x32mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x16_to_16x32mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone { ; GENERIC-LABEL: zext_16x16_to_16x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x16_to_16x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i16> %a to <16 x i32> ret <16 x i32> %x } define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_2x16mem_to_2x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_2x16mem_to_2x64: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = zext <2 x i16> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_2x16mem_to_2x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x16mem_to_2x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = sext <2 x i16> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone { ; GENERIC-LABEL: sext_2x16mem_to_2x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x16mem_to_2x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 # sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = sext <2 x i16> %a to <2 x i64> ret <2 x i64> %x } define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_4x16mem_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x16mem_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = zext <4 x i16> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_4x16mem_to_4x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x16mem_to_4x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone { ; GENERIC-LABEL: sext_4x16mem_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x16mem_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 # sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i64> ret <4 x i64> %x } define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x16mem_to_8x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x16mem_to_8x64: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = zext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_8x16mem_to_8x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x16mem_to_8x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone { ; GENERIC-LABEL: sext_8x16mem_to_8x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x16mem_to_8x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i64> ret <8 x i64> %x } define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x16_to_8x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x16_to_8x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone { ; GENERIC-LABEL: zext_8x16_to_8x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x16_to_8x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %ret = zext <8 x i16> %a to <8 x i64> ret <8 x i64> %ret } define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_2x32mem_to_2x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_2x32mem_to_2x64: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = zext <2 x i32> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_2x32mem_to_2x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x32mem_to_2x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = sext <2 x i32> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone { ; GENERIC-LABEL: sext_2x32mem_to_2x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_2x32mem_to_2x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 # sched: [6:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = sext <2 x i32> %a to <2 x i64> ret <2 x i64> %x } define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_4x32mem_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x32mem_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = zext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_4x32mem_to_4x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x32mem_to_4x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = sext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone { ; GENERIC-LABEL: sext_4x32mem_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x32mem_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 # sched: [9:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = sext <4 x i32> %a to <4 x i64> ret <4 x i64> %x } define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone { ; GENERIC-LABEL: sext_4x32_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_4x32_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = sext <4 x i32> %a to <4 x i64> ret <4 x i64> %x } define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_4x32_to_4x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4x32_to_4x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x32mem_to_8x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x32mem_to_8x64: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = zext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: sext_8x32mem_to_8x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x32mem_to_8x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = sext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone { ; GENERIC-LABEL: sext_8x32mem_to_8x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x32mem_to_8x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = sext <8 x i32> %a to <8 x i64> ret <8 x i64> %x } define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone { ; GENERIC-LABEL: sext_8x32_to_8x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxdq %ymm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8x32_to_8x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxdq %ymm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = sext <8 x i32> %a to <8 x i64> ret <8 x i64> %x } define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: zext_8x32_to_8x64mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x32_to_8x64mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = zext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone { ; GENERIC-LABEL: fptrunc_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: fptrunc_test: ; SKX: # %bb.0: ; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fptrunc <8 x double> %a to <8 x float> ret <8 x float> %b } define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone { ; GENERIC-LABEL: fpext_test: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: fpext_test: ; SKX: # %bb.0: ; SKX-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [7:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = fpext <8 x float> %a to <8 x double> ret <8 x double> %b } define <16 x i32> @zext_16i1_to_16xi32(i16 %b) { ; GENERIC-LABEL: zext_16i1_to_16xi32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16i1_to_16xi32: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i16 %b to <16 x i1> %c = zext <16 x i1> %a to <16 x i32> ret <16 x i32> %c } define <8 x i64> @zext_8i1_to_8xi64(i8 %b) { ; GENERIC-LABEL: zext_8i1_to_8xi64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8i1_to_8xi64: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i8 %b to <8 x i1> %c = zext <8 x i1> %a to <8 x i64> ret <8 x i64> %c } define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { ; GENERIC-LABEL: trunc_16i8_to_16i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: trunc_16i8_to_16i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %mask_b = trunc <16 x i8>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask } define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { ; GENERIC-LABEL: trunc_16i32_to_16i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: trunc_16i32_to_16i1: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask_b = trunc <16 x i32>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask } define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { ; GENERIC-LABEL: trunc_4i32_to_4i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpsrad $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: trunc_4i32_to_4i1: ; SKX: # %bb.0: ; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpsrad $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask_a = trunc <4 x i32>%a to <4 x i1> %mask_b = trunc <4 x i32>%b to <4 x i1> %a_and_b = and <4 x i1>%mask_a, %mask_b %res = sext <4 x i1>%a_and_b to <4 x i32> ret <4 x i32>%res } define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { ; GENERIC-LABEL: trunc_8i16_to_8i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %al killed %al killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: trunc_8i16_to_8i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %al killed %al killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %mask_b = trunc <8 x i16>%a to <8 x i1> %mask = bitcast <8 x i1> %mask_b to i8 ret i8 %mask } define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; GENERIC-LABEL: sext_8i1_8i32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8i1_8i32: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %x = icmp slt <8 x i32> %a1, %a2 %x1 = xor <8 x i1>%x, %y = sext <8 x i1> %x1 to <8 x i32> ret <8 x i32> %y } define i16 @trunc_i32_to_i1(i32 %a) { ; GENERIC-LABEL: trunc_i32_to_i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movw $-4, %ax # sched: [1:0.33] ; GENERIC-NEXT: kmovd %eax, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kshiftrw $1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kshiftlw $1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33] ; GENERIC-NEXT: kmovw %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: trunc_i32_to_i1: ; SKX: # %bb.0: ; SKX-NEXT: movw $-4, %ax # sched: [1:0.25] ; SKX-NEXT: kmovd %eax, %k0 # sched: [1:1.00] ; SKX-NEXT: kshiftrw $1, %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: kshiftlw $1, %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: andl $1, %edi # sched: [1:0.25] ; SKX-NEXT: kmovw %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %a_i = trunc i32 %a to i1 %maskv = insertelement <16 x i1> , i1 %a_i, i32 0 %res = bitcast <16 x i1> %maskv to i16 ret i16 %res } define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; GENERIC-LABEL: sext_8i1_8i16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8i1_8i16: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = icmp slt <8 x i32> %a1, %a2 %y = sext <8 x i1> %x to <8 x i16> ret <8 x i16> %y } define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { ; GENERIC-LABEL: sext_16i1_16i32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_16i1_16i32: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %x = icmp slt <16 x i32> %a1, %a2 %y = sext <16 x i1> %x to <16 x i32> ret <16 x i32> %y } define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; GENERIC-LABEL: sext_8i1_8i64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sext_8i1_8i64: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %x = icmp slt <8 x i32> %a1, %a2 %y = sext <8 x i1> %x to <8 x i64> ret <8 x i64> %y } define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { ; GENERIC-LABEL: extload_v8i64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: extload_v8i64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00] ; SKX-NEXT: vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %sign_load = load <8 x i8>, <8 x i8>* %a %c = sext <8 x i8> %sign_load to <8 x i64> store <8 x i64> %c, <8 x i64>* %res ret void } define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: test21: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [3:1.00] ; GENERIC-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: kshiftrq $32, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test21: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:1.00] ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: kshiftrq $32, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer ret <64 x i16> %ret } define <16 x i16> @shuffle_zext_16x8_to_16x16(<16 x i8> %a) nounwind readnone { ; GENERIC-LABEL: shuffle_zext_16x8_to_16x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: shuffle_zext_16x8_to_16x16: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> %2 = bitcast <32 x i8> %1 to <16 x i16> ret <16 x i16> %2 } define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: shuffle_zext_16x8_to_16x16_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: shuffle_zext_16x8_to_16x16_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> %bc = bitcast <32 x i8> %x to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %bc, <16 x i16> zeroinitializer ret <16 x i16> %ret } define <16 x i16> @zext_32x8_to_16x16(<32 x i8> %a) { ; GENERIC-LABEL: zext_32x8_to_16x16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32x8_to_16x16: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> %2 = bitcast <32 x i8> %1 to <16 x i16> ret <16 x i16> %2 } define <8 x i32> @zext_32x8_to_8x32(<32 x i8> %a) { ; GENERIC-LABEL: zext_32x8_to_8x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32x8_to_8x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> %2 = bitcast <32 x i8> %1 to <8 x i32> ret <8 x i32> %2 } define <4 x i64> @zext_32x8_to_4x64(<32 x i8> %a) { ; GENERIC-LABEL: zext_32x8_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32x8_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> %2 = bitcast <32 x i8> %1 to <4 x i64> ret <4 x i64> %2 } define <8 x i32> @zext_16x16_to_8x32(<16 x i16> %a) { ; GENERIC-LABEL: zext_16x16_to_8x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x16_to_8x32: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> %2 = bitcast <16 x i16> %1 to <8 x i32> ret <8 x i32> %2 } define <4 x i64> @zext_16x16_to_4x64(<16 x i16> %a) { ; GENERIC-LABEL: zext_16x16_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16x16_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> %2 = bitcast <16 x i16> %1 to <4 x i64> ret <4 x i64> %2 } define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) { ; GENERIC-LABEL: zext_8x32_to_4x64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_8x32_to_4x64: ; SKX: # %bb.0: ; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> %2 = bitcast <8 x i32> %1 to <4 x i64> ret <4 x i64> %2 } define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 { ; GENERIC-LABEL: zext_64xi1_to_64xi8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_64xi1_to_64xi8: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <64 x i8> %x, %y %1 = zext <64 x i1> %mask to <64 x i8> ret <64 x i8> %1 } define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 { ; GENERIC-LABEL: zext_32xi1_to_32xi16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32xi1_to_32xi16: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <32 x i16> %x, %y %1 = zext <32 x i1> %mask to <32 x i16> ret <32 x i16> %1 } define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 { ; GENERIC-LABEL: zext_16xi1_to_16xi16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_16xi1_to_16xi16: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <16 x i16> %x, %y %1 = zext <16 x i1> %mask to <16 x i16> ret <16 x i16> %1 } define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { ; GENERIC-LABEL: zext_32xi1_to_32xi8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_32xi1_to_32xi8: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <32 x i16> %x, %y %1 = zext <32 x i1> %mask to <32 x i8> ret <32 x i8> %1 } define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { ; GENERIC-LABEL: zext_4xi1_to_4x32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50] ; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_4xi1_to_4x32: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50] ; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <4 x i8> %x, %y %1 = zext <4 x i1> %mask to <4 x i32> ret <4 x i32> %1 } define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 { ; GENERIC-LABEL: zext_2xi1_to_2xi64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50] ; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_2xi1_to_2xi64: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50] ; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <2 x i8> %x, %y %1 = zext <2 x i1> %mask to <2 x i64> ret <2 x i64> %1 } define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; GENERIC-LABEL: test_x86_fmadd_ps_z: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmadd_ps_z: ; SKX: # %bb.0: ; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vaddps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <16 x float> %a0, %a1 %res = fadd <16 x float> %x, %a2 ret <16 x float> %res } define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; GENERIC-LABEL: test_x86_fmsub_ps_z: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmsub_ps_z: ; SKX: # %bb.0: ; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <16 x float> %a0, %a1 %res = fsub <16 x float> %x, %a2 ret <16 x float> %res } define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; GENERIC-LABEL: test_x86_fnmadd_ps_z: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vsubps %zmm0, %zmm2, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fnmadd_ps_z: ; SKX: # %bb.0: ; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vsubps %zmm0, %zmm2, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <16 x float> %a0, %a1 %res = fsub <16 x float> %a2, %x ret <16 x float> %res } define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; GENERIC-LABEL: test_x86_fnmsub_ps_z: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fnmsub_ps_z: ; SKX: # %bb.0: ; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <16 x float> %a0, %a1 %y = fsub <16 x float> , %x %res = fsub <16 x float> %y, %a2 ret <16 x float> %res } define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; GENERIC-LABEL: test_x86_fmadd_pd_z: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmadd_pd_z: ; SKX: # %bb.0: ; SKX-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vaddpd %zmm2, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <8 x double> %a0, %a1 %res = fadd <8 x double> %x, %a2 ret <8 x double> %res } define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; GENERIC-LABEL: test_x86_fmsub_pd_z: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vsubpd %zmm2, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmsub_pd_z: ; SKX: # %bb.0: ; SKX-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vsubpd %zmm2, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <8 x double> %a0, %a1 %res = fsub <8 x double> %x, %a2 ret <8 x double> %res } define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) { ; GENERIC-LABEL: test_x86_fmsub_213: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] ; GENERIC-NEXT: vsubsd %xmm2, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmsub_213: ; SKX: # %bb.0: ; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vsubsd %xmm2, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %x = fmul double %a0, %a1 %res = fsub double %x, %a2 ret double %res } define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) { ; GENERIC-LABEL: test_x86_fmsub_213_m: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] ; GENERIC-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmsub_213_m: ; SKX: # %bb.0: ; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a2 = load double , double *%a2_ptr %x = fmul double %a0, %a1 %res = fsub double %x, %a2 ret double %res } define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) { ; GENERIC-LABEL: test_x86_fmsub_231_m: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00] ; GENERIC-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmsub_231_m: ; SKX: # %bb.0: ; SKX-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] ; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a2 = load double , double *%a2_ptr %x = fmul double %a0, %a2 %res = fsub double %x, %a1 ret double %res } define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind { ; GENERIC-LABEL: test231_br: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [9:1.00] ; GENERIC-NEXT: vaddps %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test231_br: ; SKX: # %bb.0: ; SKX-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b1 = fmul <16 x float> %a1, %b2 = fadd <16 x float> %b1, %a2 ret <16 x float> %b2 } define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind { ; GENERIC-LABEL: test213_br: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test213_br: ; SKX: # %bb.0: ; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %b1 = fmul <16 x float> %a1, %a2 %b2 = fadd <16 x float> %b1, ret <16 x float> %b2 } ;mask (a*c+b , a) define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { ; GENERIC-LABEL: test_x86_fmadd132_ps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmulps (%rdi), %zmm0, %zmm2 # sched: [9:1.00] ; GENERIC-NEXT: vaddps %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmadd132_ps: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] ; SKX-NEXT: vmulps (%rdi), %zmm0, %zmm2 # sched: [11:0.50] ; SKX-NEXT: vaddps %zmm1, %zmm2, %zmm0 {%k1} # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1 %x = fmul <16 x float> %a0, %a2 %y = fadd <16 x float> %x, %a1 %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a0 ret <16 x float> %res } ;mask (a*c+b , b) define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { ; GENERIC-LABEL: test_x86_fmadd231_ps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmulps (%rdi), %zmm0, %zmm0 # sched: [9:1.00] ; GENERIC-NEXT: vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmadd231_ps: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] ; SKX-NEXT: vmulps (%rdi), %zmm0, %zmm0 # sched: [11:0.50] ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [4:0.33] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1 %x = fmul <16 x float> %a0, %a2 %y = fadd <16 x float> %x, %a1 %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1 ret <16 x float> %res } ;mask (b*a+c , b) define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { ; GENERIC-LABEL: test_x86_fmadd213_ps: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [7:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_x86_fmadd213_ps: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] ; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] ; SKX-NEXT: vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [11:0.50] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1 %x = fmul <16 x float> %a1, %a0 %y = fadd <16 x float> %x, %a2 %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1 ret <16 x float> %res } define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpandd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpandd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <16 x i32> %a, %x = and <16 x i32> %a2, %b ret <16 x i32> %x } define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpandnd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpandnd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <16 x i32> %a, %b2 = xor <16 x i32> %b, %x = and <16 x i32> %a2, %b2 ret <16 x i32> %x } define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpord: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpord: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <16 x i32> %a, %x = or <16 x i32> %a2, %b ret <16 x i32> %x } define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpxord: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpxord: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <16 x i32> %a, %x = xor <16 x i32> %a2, %b ret <16 x i32> %x } define <8 x i64> @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpandq: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpandq: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <8 x i64> %a, %x = and <8 x i64> %a2, %b ret <8 x i64> %x } define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpandnq: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpandnq: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <8 x i64> %a, %b2 = xor <8 x i64> %b, %x = and <8 x i64> %a2, %b2 ret <8 x i64> %x } define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vporq: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vporq: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <8 x i64> %a, %x = or <8 x i64> %a2, %b ret <8 x i64> %x } define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { ; GENERIC-LABEL: vpxorq: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00] ; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpxorq: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] ; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <8 x i64> %a, %x = xor <8 x i64> %a2, %b ret <8 x i64> %x } define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) { ; GENERIC-LABEL: and_v64i8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: and_v64i8: ; SKX: # %bb.0: ; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %res = and <64 x i8> %a, %b ret <64 x i8> %res } define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) { ; GENERIC-LABEL: andn_v64i8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: andn_v64i8: ; SKX: # %bb.0: ; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b2 = xor <64 x i8> %b, %res = and <64 x i8> %a, %b2 ret <64 x i8> %res } define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) { ; GENERIC-LABEL: or_v64i8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: or_v64i8: ; SKX: # %bb.0: ; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %res = or <64 x i8> %a, %b ret <64 x i8> %res } define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) { ; GENERIC-LABEL: xor_v64i8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: xor_v64i8: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %res = xor <64 x i8> %a, %b ret <64 x i8> %res } define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) { ; GENERIC-LABEL: and_v32i16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: and_v32i16: ; SKX: # %bb.0: ; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %res = and <32 x i16> %a, %b ret <32 x i16> %res } define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) { ; GENERIC-LABEL: andn_v32i16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: andn_v32i16: ; SKX: # %bb.0: ; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %b2 = xor <32 x i16> %b, %res = and <32 x i16> %a, %b2 ret <32 x i16> %res } define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) { ; GENERIC-LABEL: or_v32i16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: or_v32i16: ; SKX: # %bb.0: ; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %res = or <32 x i16> %a, %b ret <32 x i16> %res } define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) { ; GENERIC-LABEL: xor_v32i16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: xor_v32i16: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %res = xor <32 x i16> %a, %b ret <32 x i16> %res } define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { ; GENERIC-LABEL: masked_and_v16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: masked_and_v16f32: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] ; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <16 x float> %a to <16 x i32> %b1 = bitcast <16 x float> %b to <16 x i32> %passThru1 = bitcast <16 x float> %passThru to <16 x i32> %mask1 = bitcast i16 %mask to <16 x i1> %op = and <16 x i32> %a1, %b1 %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1 %cast = bitcast <16 x i32> %select to <16 x float> %add = fadd <16 x float> %c, %cast ret <16 x float> %add } define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { ; GENERIC-LABEL: masked_or_v16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: masked_or_v16f32: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] ; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <16 x float> %a to <16 x i32> %b1 = bitcast <16 x float> %b to <16 x i32> %passThru1 = bitcast <16 x float> %passThru to <16 x i32> %mask1 = bitcast i16 %mask to <16 x i1> %op = and <16 x i32> %a1, %b1 %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1 %cast = bitcast <16 x i32> %select to <16 x float> %add = fadd <16 x float> %c, %cast ret <16 x float> %add } define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { ; GENERIC-LABEL: masked_xor_v16f32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: masked_xor_v16f32: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] ; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <16 x float> %a to <16 x i32> %b1 = bitcast <16 x float> %b to <16 x i32> %passThru1 = bitcast <16 x float> %passThru to <16 x i32> %mask1 = bitcast i16 %mask to <16 x i1> %op = and <16 x i32> %a1, %b1 %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1 %cast = bitcast <16 x i32> %select to <16 x float> %add = fadd <16 x float> %c, %cast ret <16 x float> %add } define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { ; GENERIC-LABEL: masked_and_v8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: masked_and_v8f64: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] ; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <8 x double> %a to <8 x i64> %b1 = bitcast <8 x double> %b to <8 x i64> %passThru1 = bitcast <8 x double> %passThru to <8 x i64> %mask1 = bitcast i8 %mask to <8 x i1> %op = and <8 x i64> %a1, %b1 %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1 %cast = bitcast <8 x i64> %select to <8 x double> %add = fadd <8 x double> %c, %cast ret <8 x double> %add } define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { ; GENERIC-LABEL: masked_or_v8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: masked_or_v8f64: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] ; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <8 x double> %a to <8 x i64> %b1 = bitcast <8 x double> %b to <8 x i64> %passThru1 = bitcast <8 x double> %passThru to <8 x i64> %mask1 = bitcast i8 %mask to <8 x i1> %op = and <8 x i64> %a1, %b1 %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1 %cast = bitcast <8 x i64> %select to <8 x double> %add = fadd <8 x double> %c, %cast ret <8 x double> %add } define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { ; GENERIC-LABEL: masked_xor_v8f64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: masked_xor_v8f64: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] ; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <8 x double> %a to <8 x i64> %b1 = bitcast <8 x double> %b to <8 x i64> %passThru1 = bitcast <8 x double> %passThru to <8 x i64> %mask1 = bitcast i8 %mask to <8 x i1> %op = and <8 x i64> %a1, %b1 %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1 %cast = bitcast <8 x i64> %select to <8 x double> %add = fadd <8 x double> %c, %cast ret <8 x double> %add } define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_and_epi32: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_and_epi32: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %and1.i.i = and <8 x i64> %__a, %__b %0 = bitcast <8 x i64> %and1.i.i to <16 x i32> %1 = bitcast <8 x i64> %__src to <16 x i32> %2 = bitcast i16 %__k to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1 %4 = bitcast <16 x i32> %3 to <8 x i64> ret <8 x i64> %4 } define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_or_epi32: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_or_epi32: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %or1.i.i = or <8 x i64> %__a, %__b %0 = bitcast <8 x i64> %or1.i.i to <16 x i32> %1 = bitcast <8 x i64> %__src to <16 x i32> %2 = bitcast i16 %__k to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1 %4 = bitcast <16 x i32> %3 to <8 x i64> ret <8 x i64> %4 } define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { ; GENERIC-LABEL: test_mm512_mask_xor_epi32: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_xor_epi32: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %xor1.i.i = xor <8 x i64> %__a, %__b %0 = bitcast <8 x i64> %xor1.i.i to <16 x i32> %1 = bitcast <8 x i64> %__src to <16 x i32> %2 = bitcast i16 %__k to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1 %4 = bitcast <16 x i32> %3 to <8 x i64> ret <8 x i64> %4 } define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_xor_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_xor_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> %xor.i.i = xor <8 x i64> %0, %1 %2 = bitcast <8 x i64> %xor.i.i to <8 x double> %3 = bitcast i8 %__U to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W ret <8 x double> %4 } define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_xor_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_xor_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> %xor.i.i = xor <8 x i64> %0, %1 %2 = bitcast <8 x i64> %xor.i.i to <8 x double> %3 = bitcast i8 %__U to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer ret <8 x double> %4 } define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_xor_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_xor_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> %xor.i.i = xor <16 x i32> %0, %1 %2 = bitcast <16 x i32> %xor.i.i to <16 x float> %3 = bitcast i16 %__U to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W ret <16 x float> %4 } define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_xor_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_xor_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> %xor.i.i = xor <16 x i32> %0, %1 %2 = bitcast <16 x i32> %xor.i.i to <16 x float> %3 = bitcast i16 %__U to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer ret <16 x float> %4 } define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_or_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_or_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> %or.i.i = or <8 x i64> %1, %0 %2 = bitcast <8 x i64> %or.i.i to <8 x double> %3 = bitcast i8 %__U to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W ret <8 x double> %4 } define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_or_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_or_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> %or.i.i = or <8 x i64> %1, %0 %2 = bitcast <8 x i64> %or.i.i to <8 x double> %3 = bitcast i8 %__U to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer ret <8 x double> %4 } define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_or_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_or_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> %or.i.i = or <16 x i32> %1, %0 %2 = bitcast <16 x i32> %or.i.i to <16 x float> %3 = bitcast i16 %__U to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W ret <16 x float> %4 } define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_or_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_or_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> %or.i.i = or <16 x i32> %1, %0 %2 = bitcast <16 x i32> %or.i.i to <16 x float> %3 = bitcast i16 %__U to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer ret <16 x float> %4 } define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_and_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_and_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> %and.i.i = and <8 x i64> %1, %0 %2 = bitcast <8 x i64> %and.i.i to <8 x double> %3 = bitcast i8 %__U to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W ret <8 x double> %4 } define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_and_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_and_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> %and.i.i = and <8 x i64> %1, %0 %2 = bitcast <8 x i64> %and.i.i to <8 x double> %3 = bitcast i8 %__U to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer ret <8 x double> %4 } define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_and_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_and_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> %and.i.i = and <16 x i32> %1, %0 %2 = bitcast <16 x i32> %and.i.i to <16 x float> %3 = bitcast i16 %__U to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W ret <16 x float> %4 } define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_and_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_and_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> %and.i.i = and <16 x i32> %1, %0 %2 = bitcast <16 x i32> %and.i.i to <16 x float> %3 = bitcast i16 %__U to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer ret <16 x float> %4 } define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_mask_andnot_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_andnot_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %neg.i.i = xor <8 x i64> %0, %1 = bitcast <8 x double> %__B to <8 x i64> %and.i.i = and <8 x i64> %1, %neg.i.i %2 = bitcast <8 x i64> %and.i.i to <8 x double> %3 = bitcast i8 %__U to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W ret <8 x double> %4 } define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { ; GENERIC-LABEL: test_mm512_maskz_andnot_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_andnot_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %neg.i.i = xor <8 x i64> %0, %1 = bitcast <8 x double> %__B to <8 x i64> %and.i.i = and <8 x i64> %1, %neg.i.i %2 = bitcast <8 x i64> %and.i.i to <8 x double> %3 = bitcast i8 %__U to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer ret <8 x double> %4 } define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_mask_andnot_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_mask_andnot_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %neg.i.i = xor <16 x i32> %0, %1 = bitcast <16 x float> %__B to <16 x i32> %and.i.i = and <16 x i32> %1, %neg.i.i %2 = bitcast <16 x i32> %and.i.i to <16 x float> %3 = bitcast i16 %__U to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W ret <16 x float> %4 } define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { ; GENERIC-LABEL: test_mm512_maskz_andnot_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_maskz_andnot_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] ; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %neg.i.i = xor <16 x i32> %0, %1 = bitcast <16 x float> %__B to <16 x i32> %and.i.i = and <16 x i32> %1, %neg.i.i %2 = bitcast <16 x i32> %and.i.i to <16 x float> %3 = bitcast i16 %__U to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer ret <16 x float> %4 } define i32 @mov_test1(float %x) { ; GENERIC-LABEL: mov_test1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test1: ; SKX: # %bb.0: ; SKX-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast float %x to i32 ret i32 %res } define <4 x i32> @mov_test2(i32 %x) { ; GENERIC-LABEL: mov_test2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test2: ; SKX: # %bb.0: ; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = insertelement <4 x i32>undef, i32 %x, i32 0 ret <4 x i32>%res } define <2 x i64> @mov_test3(i64 %x) { ; GENERIC-LABEL: mov_test3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test3: ; SKX: # %bb.0: ; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = insertelement <2 x i64>undef, i64 %x, i32 0 ret <2 x i64>%res } define <4 x i32> @mov_test4(i32* %x) { ; GENERIC-LABEL: mov_test4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test4: ; SKX: # %bb.0: ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %y = load i32, i32* %x %res = insertelement <4 x i32>undef, i32 %y, i32 0 ret <4 x i32>%res } define void @mov_test5(float %x, float* %y) { ; GENERIC-LABEL: mov_test5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovss %xmm0, (%rdi) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test5: ; SKX: # %bb.0: ; SKX-NEXT: vmovss %xmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] store float %x, float* %y, align 4 ret void } define void @mov_test6(double %x, double* %y) { ; GENERIC-LABEL: mov_test6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovsd %xmm0, (%rdi) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test6: ; SKX: # %bb.0: ; SKX-NEXT: vmovsd %xmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] store double %x, double* %y, align 8 ret void } define float @mov_test7(i32* %x) { ; GENERIC-LABEL: mov_test7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test7: ; SKX: # %bb.0: ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %y = load i32, i32* %x %res = bitcast i32 %y to float ret float %res } define i32 @mov_test8(<4 x i32> %x) { ; GENERIC-LABEL: mov_test8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [2:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test8: ; SKX: # %bb.0: ; SKX-NEXT: vmovd %xmm0, %eax # sched: [2:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = extractelement <4 x i32> %x, i32 0 ret i32 %res } define i64 @mov_test9(<2 x i64> %x) { ; GENERIC-LABEL: mov_test9: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovq %xmm0, %rax # sched: [2:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test9: ; SKX: # %bb.0: ; SKX-NEXT: vmovq %xmm0, %rax # sched: [2:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = extractelement <2 x i64> %x, i32 0 ret i64 %res } define <4 x i32> @mov_test10(i32* %x) { ; GENERIC-LABEL: mov_test10: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test10: ; SKX: # %bb.0: ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %y = load i32, i32* %x, align 4 %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0 ret <4 x i32>%res } define <4 x float> @mov_test11(float* %x) { ; GENERIC-LABEL: mov_test11: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test11: ; SKX: # %bb.0: ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %y = load float, float* %x, align 4 %res = insertelement <4 x float>zeroinitializer, float %y, i32 0 ret <4 x float>%res } define <2 x double> @mov_test12(double* %x) { ; GENERIC-LABEL: mov_test12: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test12: ; SKX: # %bb.0: ; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %y = load double, double* %x, align 8 %res = insertelement <2 x double>zeroinitializer, double %y, i32 0 ret <2 x double>%res } define <2 x i64> @mov_test13(i64 %x) { ; GENERIC-LABEL: mov_test13: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test13: ; SKX: # %bb.0: ; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0 ret <2 x i64>%res } define <4 x i32> @mov_test14(i32 %x) { ; GENERIC-LABEL: mov_test14: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test14: ; SKX: # %bb.0: ; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0 ret <4 x i32>%res } define <4 x i32> @mov_test15(i32* %x) { ; GENERIC-LABEL: mov_test15: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test15: ; SKX: # %bb.0: ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %y = load i32, i32* %x, align 4 %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0 ret <4 x i32>%res } define <16 x i32> @mov_test16(i8 * %addr) { ; GENERIC-LABEL: mov_test16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test16: ; SKX: # %bb.0: ; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 1 ret <16 x i32>%res } define <16 x i32> @mov_test17(i8 * %addr) { ; GENERIC-LABEL: mov_test17: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test17: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 64 ret <16 x i32>%res } define void @mov_test18(i8 * %addr, <8 x i64> %data) { ; GENERIC-LABEL: mov_test18: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test18: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* store <8 x i64>%data, <8 x i64>* %vaddr, align 64 ret void } define void @mov_test19(i8 * %addr, <16 x i32> %data) { ; GENERIC-LABEL: mov_test19: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test19: ; SKX: # %bb.0: ; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 1 ret void } define void @mov_test20(i8 * %addr, <16 x i32> %data) { ; GENERIC-LABEL: mov_test20: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test20: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 64 ret void } define <8 x i64> @mov_test21(i8 * %addr) { ; GENERIC-LABEL: mov_test21: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test21: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 64 ret <8 x i64>%res } define void @mov_test22(i8 * %addr, <8 x i64> %data) { ; GENERIC-LABEL: mov_test22: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test22: ; SKX: # %bb.0: ; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* store <8 x i64>%data, <8 x i64>* %vaddr, align 1 ret void } define <8 x i64> @mov_test23(i8 * %addr) { ; GENERIC-LABEL: mov_test23: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test23: ; SKX: # %bb.0: ; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 1 ret <8 x i64>%res } define void @mov_test24(i8 * %addr, <8 x double> %data) { ; GENERIC-LABEL: mov_test24: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test24: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 64 ret void } define <8 x double> @mov_test25(i8 * %addr) { ; GENERIC-LABEL: mov_test25: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test25: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 64 ret <8 x double>%res } define void @mov_test26(i8 * %addr, <16 x float> %data) { ; GENERIC-LABEL: mov_test26: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test26: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 64 ret void } define <16 x float> @mov_test27(i8 * %addr) { ; GENERIC-LABEL: mov_test27: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test27: ; SKX: # %bb.0: ; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x float>* %res = load <16 x float>, <16 x float>* %vaddr, align 64 ret <16 x float>%res } define void @mov_test28(i8 * %addr, <8 x double> %data) { ; GENERIC-LABEL: mov_test28: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test28: ; SKX: # %bb.0: ; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 1 ret void } define <8 x double> @mov_test29(i8 * %addr) { ; GENERIC-LABEL: mov_test29: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test29: ; SKX: # %bb.0: ; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 1 ret <8 x double>%res } define void @mov_test30(i8 * %addr, <16 x float> %data) { ; GENERIC-LABEL: mov_test30: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test30: ; SKX: # %bb.0: ; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 1 ret void } define <16 x float> @mov_test31(i8 * %addr) { ; GENERIC-LABEL: mov_test31: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test31: ; SKX: # %bb.0: ; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x float>* %res = load <16 x float>, <16 x float>* %vaddr, align 1 ret <16 x float>%res } define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test32: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old ret <16 x i32>%res } define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test33: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test33: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old ret <16 x i32>%res } define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test34: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test34: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer ret <16 x i32>%res } define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test35: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test35: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer ret <16 x i32>%res } define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test36: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test36: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old ret <8 x i64>%res } define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test37: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test37: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old ret <8 x i64>%res } define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test38: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test38: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer ret <8 x i64>%res } define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test39: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test39: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer ret <8 x i64>%res } define <16 x float> @mov_test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { ; GENERIC-LABEL: mov_test40: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test40: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 64 %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old ret <16 x float>%res } define <16 x float> @mov_test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { ; GENERIC-LABEL: mov_test41: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test41: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 1 %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old ret <16 x float>%res } define <16 x float> @mov_test42(i8 * %addr, <16 x float> %mask1) { ; GENERIC-LABEL: mov_test42: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test42: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 64 %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer ret <16 x float>%res } define <16 x float> @mov_test43(i8 * %addr, <16 x float> %mask1) { ; GENERIC-LABEL: mov_test43: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test43: ; SKX: # %bb.0: ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 1 %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer ret <16 x float>%res } define <8 x double> @mov_test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { ; GENERIC-LABEL: mov_test44: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test44: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 64 %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old ret <8 x double>%res } define <8 x double> @mov_test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { ; GENERIC-LABEL: mov_test45: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test45: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 1 %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old ret <8 x double>%res } define <8 x double> @mov_test46(i8 * %addr, <8 x double> %mask1) { ; GENERIC-LABEL: mov_test46: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test46: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 64 %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer ret <8 x double>%res } define <8 x double> @mov_test47(i8 * %addr, <8 x double> %mask1) { ; GENERIC-LABEL: mov_test47: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test47: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 1 %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer ret <8 x double>%res } define i16 @mask16(i16 %x) { ; GENERIC-LABEL: mask16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask16: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, %ret = bitcast <16 x i1> %m1 to i16 ret i16 %ret } define i32 @mask16_zext(i16 %x) { ; GENERIC-LABEL: mask16_zext: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask16_zext: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, %m2 = bitcast <16 x i1> %m1 to i16 %ret = zext i16 %m2 to i32 ret i32 %ret } define i8 @mask8(i8 %x) { ; GENERIC-LABEL: mask8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %al killed %al killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask8: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %al killed %al killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, %ret = bitcast <8 x i1> %m1 to i8 ret i8 %ret } define i32 @mask8_zext(i8 %x) { ; GENERIC-LABEL: mask8_zext: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask8_zext: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, %eax # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, %m2 = bitcast <8 x i1> %m1 to i8 %ret = zext i8 %m2 to i32 ret i32 %ret } define void @mask16_mem(i16* %ptr) { ; GENERIC-LABEL: mask16_mem: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovw (%rdi), %k0 ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovw %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask16_mem: ; SKX: # %bb.0: ; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = load i16, i16* %ptr, align 4 %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, %ret = bitcast <16 x i1> %m1 to i16 store i16 %ret, i16* %ptr, align 4 ret void } define void @mask8_mem(i8* %ptr) { ; GENERIC-LABEL: mask8_mem: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovb (%rdi), %k0 ; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mask8_mem: ; SKX: # %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = load i8, i8* %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, %ret = bitcast <8 x i1> %m1 to i8 store i8 %ret, i8* %ptr, align 4 ret void } define i16 @mand16(i16 %x, i16 %y) { ; GENERIC-LABEL: mand16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] ; GENERIC-NEXT: xorl %esi, %eax # sched: [1:0.33] ; GENERIC-NEXT: andl %esi, %edi # sched: [1:0.33] ; GENERIC-NEXT: orl %eax, %edi # sched: [1:0.33] ; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mand16: ; SKX: # %bb.0: ; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] ; SKX-NEXT: xorl %esi, %eax # sched: [1:0.25] ; SKX-NEXT: andl %esi, %edi # sched: [1:0.25] ; SKX-NEXT: orl %eax, %edi # sched: [1:0.25] ; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %ma = bitcast i16 %x to <16 x i1> %mb = bitcast i16 %y to <16 x i1> %mc = and <16 x i1> %ma, %mb %md = xor <16 x i1> %ma, %mb %me = or <16 x i1> %mc, %md %ret = bitcast <16 x i1> %me to i16 ret i16 %ret } define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) { ; GENERIC-LABEL: mand16_mem: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovw (%rdi), %k0 ; GENERIC-NEXT: kmovw (%rsi), %k1 ; GENERIC-NEXT: kandw %k1, %k0, %k2 # sched: [1:1.00] ; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: korw %k0, %k2, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mand16_mem: ; SKX: # %bb.0: ; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: kmovw (%rsi), %k1 # sched: [7:1.00] ; SKX-NEXT: kandw %k1, %k0, %k2 # sched: [1:1.00] ; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: korw %k0, %k2, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %ma = load <16 x i1>, <16 x i1>* %x %mb = load <16 x i1>, <16 x i1>* %y %mc = and <16 x i1> %ma, %mb %md = xor <16 x i1> %ma, %mb %me = or <16 x i1> %mc, %md %ret = bitcast <16 x i1> %me to i16 ret i16 %ret } define i8 @shuf_test1(i16 %v) nounwind { ; GENERIC-LABEL: shuf_test1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kshiftrw $8, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %al killed %al killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: shuf_test1: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kshiftrw $8, %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %al killed %al killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %v1 = bitcast i16 %v to <16 x i1> %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> %mask1 = bitcast <8 x i1> %mask to i8 ret i8 %mask1 } define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_test1: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kshiftrw $5, %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: andl $1, %eax # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i32 ret i32 %res } define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_test2: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kshiftrw $5, %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: andl $1, %eax # sched: [1:0.25] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i16 ret i16 %res } define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { ; GENERIC-LABEL: zext_test3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: andb $1, %al # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %al killed %al killed %eax ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_test3: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kshiftrw $5, %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: andb $1, %al # sched: [1:0.25] ; SKX-NEXT: # kill: def %al killed %al killed %eax ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i8 ret i8 %res } define i8 @conv1(<8 x i1>* %R) { ; GENERIC-LABEL: conv1: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: kxnorw %k0, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; GENERIC-NEXT: movb $-2, %al # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: conv1: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kxnorw %k0, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; SKX-NEXT: movb $-2, %al # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] entry: store <8 x i1> , <8 x i1>* %R %maskPtr = alloca <8 x i1> store <8 x i1> , <8 x i1>* %maskPtr %mask = load <8 x i1>, <8 x i1>* %maskPtr %mask_convert = bitcast <8 x i1> %mask to i8 ret i8 %mask_convert } define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) { ; GENERIC-LABEL: test4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: kandnw %k0, %k1, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test4: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00] ; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: kandnw %k0, %k1, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x_gt_y = icmp sgt <4 x i64> %x, %y %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1 %res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1 %resse = sext <4 x i1>%res to <4 x i32> ret <4 x i32> %resse } define <2 x i64> @vcmp_test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) { ; GENERIC-LABEL: vcmp_test5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: kandnw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vcmp_test5: ; SKX: # %bb.0: ; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: kandnw %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %x_gt_y = icmp slt <2 x i64> %x, %y %x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1 %res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1 %resse = sext <2 x i1>%res to <2 x i64> ret <2 x i64> %resse }define void @vcmp_test6(<16 x i1> %mask) { allocas: %a= and <16 x i1> %mask, %b = bitcast <16 x i1> %a to i16 %c = icmp eq i16 %b, 0 br i1 %c, label %true, label %false true: ret void false: ret void } define void @vcmp_test7(<8 x i1> %mask) { ; GENERIC-LABEL: vcmp_test7: ; GENERIC: # %bb.0: # %allocas ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: movb $85, %al # sched: [1:0.33] ; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: ktestb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vcmp_test7: ; SKX: # %bb.0: # %allocas ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: movb $85, %al # sched: [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] ; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: ktestb %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] allocas: %a= or <8 x i1> %mask, %b = bitcast <8 x i1> %a to i8 %c = icmp eq i8 %b, 0 br i1 %c, label %true, label %false true: ret void false: ret void } define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; GENERIC-LABEL: vcmp_test8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] ; GENERIC-NEXT: jg .LBB386_1 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.2: ; GENERIC-NEXT: vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB386_1: ; GENERIC-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vcmp_test8: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] ; SKX-NEXT: jg .LBB386_1 # sched: [1:0.50] ; SKX-NEXT: # %bb.2: ; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] ; SKX-NEXT: .LBB386_1: ; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cond = icmp sgt i32 %a1, %b1 %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer %cmp2 = icmp ult <16 x i32> %b, zeroinitializer %mix = select i1 %cond, <16 x i1> %cmp1, <16 x i1> %cmp2 %res = sext <16 x i1> %mix to <16 x i8> ret <16 x i8> %res } define <16 x i1> @vpmov_test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { ; GENERIC-LABEL: vpmov_test9: ; GENERIC: # %bb.0: ; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] ; GENERIC-NEXT: jg .LBB387_1 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.2: ; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: jmp .LBB387_3 # sched: [1:1.00] ; GENERIC-NEXT: .LBB387_1: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: .LBB387_3: ; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vpmov_test9: ; SKX: # %bb.0: ; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] ; SKX-NEXT: jg .LBB387_1 # sched: [1:0.50] ; SKX-NEXT: # %bb.2: ; SKX-NEXT: vpsllw $7, %xmm1, %xmm0 # sched: [1:0.50] ; SKX-NEXT: jmp .LBB387_3 # sched: [1:0.50] ; SKX-NEXT: .LBB387_1: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: .LBB387_3: ; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <16 x i1>%a, <16 x i1>%b ret <16 x i1>%c }define <8 x i1> @vpmov_test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) { %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <8 x i1>%a, <8 x i1>%b ret <8 x i1>%c } define <4 x i1> @vmov_test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) { ; GENERIC-LABEL: vmov_test11: ; GENERIC: # %bb.0: ; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] ; GENERIC-NEXT: jg .LBB389_1 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.2: ; GENERIC-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: jmp .LBB389_3 # sched: [1:1.00] ; GENERIC-NEXT: .LBB389_1: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: .LBB389_3: ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test11: ; SKX: # %bb.0: ; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] ; SKX-NEXT: jg .LBB389_1 # sched: [1:0.50] ; SKX-NEXT: # %bb.2: ; SKX-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50] ; SKX-NEXT: jmp .LBB389_3 # sched: [1:0.50] ; SKX-NEXT: .LBB389_1: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: .LBB389_3: ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <4 x i1>%a, <4 x i1>%b ret <4 x i1>%c } define i32 @vmov_test12(i32 %x, i32 %y) { ; GENERIC-LABEL: vmov_test12: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test12: ; SKX: # %bb.0: ; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 0 %c = select i1 %b, i32 %x, i32 %y ret i32 %c } define i32 @vmov_test13(i32 %x, i32 %y) { ; GENERIC-LABEL: vmov_test13: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movl %esi, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test13: ; SKX: # %bb.0: ; SKX-NEXT: movl %esi, %eax # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 3 %c = select i1 %b, i32 %x, i32 %y ret i32 %c }define <4 x i1> @vmov_test14() { %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 2 %c = insertelement <4 x i1> , i1 %b, i32 1 ret <4 x i1> %c } define <16 x i1> @vmov_test15(i32 %x, i32 %y) { ; GENERIC-LABEL: vmov_test15: ; GENERIC: # %bb.0: ; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] ; GENERIC-NEXT: movw $21845, %ax # imm = 0x5555 ; GENERIC-NEXT: # sched: [1:0.33] ; GENERIC-NEXT: movw $1, %cx # sched: [1:0.33] ; GENERIC-NEXT: cmovgw %ax, %cx # sched: [2:0.67] ; GENERIC-NEXT: kmovd %ecx, %k0 # sched: [1:0.33] ; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test15: ; SKX: # %bb.0: ; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] ; SKX-NEXT: movw $21845, %ax # imm = 0x5555 ; SKX-NEXT: # sched: [1:0.25] ; SKX-NEXT: movw $1, %cx # sched: [1:0.25] ; SKX-NEXT: cmovgw %ax, %cx # sched: [1:0.50] ; SKX-NEXT: kmovd %ecx, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i16 21845 to <16 x i1> %b = bitcast i16 1 to <16 x i1> %mask = icmp sgt i32 %x, %y %c = select i1 %mask, <16 x i1> %a, <16 x i1> %b ret <16 x i1> %c } define <64 x i8> @vmov_test16(i64 %x) { ; ; GENERIC-LABEL: vmov_test16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovq %rdi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: movb $1, %al # sched: [1:0.33] ; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kshiftrq $5, %k0, %k2 # sched: [1:1.00] ; GENERIC-NEXT: kxorq %k1, %k2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kshiftlq $63, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kshiftrq $58, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kxorq %k0, %k1, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test16: ; SKX: # %bb.0: ; SKX-NEXT: kmovq %rdi, %k0 # sched: [1:1.00] ; SKX-NEXT: movb $1, %al # sched: [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] ; SKX-NEXT: kshiftrq $5, %k0, %k2 # sched: [3:1.00] ; SKX-NEXT: kxorq %k1, %k2, %k1 # sched: [1:1.00] ; SKX-NEXT: kshiftlq $63, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: kshiftrq $58, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: kxorq %k0, %k1, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i64 %x to <64 x i1> %b = insertelement <64 x i1>%a, i1 true, i32 5 %c = sext <64 x i1>%b to <64 x i8> ret <64 x i8>%c } define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) { ; ; GENERIC-LABEL: vmov_test17: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovq %rdi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: cmpl %edx, %esi # sched: [1:0.33] ; GENERIC-NEXT: setg %al # sched: [1:0.50] ; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kshiftrq $5, %k0, %k2 # sched: [1:1.00] ; GENERIC-NEXT: kxorq %k1, %k2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kshiftlq $63, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kshiftrq $58, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kxorq %k0, %k1, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test17: ; SKX: # %bb.0: ; SKX-NEXT: kmovq %rdi, %k0 # sched: [1:1.00] ; SKX-NEXT: cmpl %edx, %esi # sched: [1:0.25] ; SKX-NEXT: setg %al # sched: [1:0.50] ; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] ; SKX-NEXT: kshiftrq $5, %k0, %k2 # sched: [3:1.00] ; SKX-NEXT: kxorq %k1, %k2, %k1 # sched: [1:1.00] ; SKX-NEXT: kshiftlq $63, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: kshiftrq $58, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: kxorq %k0, %k1, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i64 %x to <64 x i1> %b = icmp sgt i32 %y, %z %c = insertelement <64 x i1>%a, i1 %b, i32 5 %d = sext <64 x i1>%c to <64 x i8> ret <64 x i8>%d } define <8 x i1> @vmov_test18(i8 %a, i16 %y) { ; GENERIC-LABEL: vmov_test18: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kshiftrw $8, %k1, %k2 # sched: [1:1.00] ; GENERIC-NEXT: kshiftrw $9, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kshiftrb $6, %k0, %k3 # sched: [1:1.00] ; GENERIC-NEXT: kxorb %k1, %k3, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kshiftlb $7, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kshiftrb $1, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kxorb %k0, %k1, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kshiftlb $1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kshiftrb $1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kshiftlb $7, %k2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test18: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] ; SKX-NEXT: kshiftrw $8, %k1, %k2 # sched: [3:1.00] ; SKX-NEXT: kshiftrw $9, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: kshiftrb $6, %k0, %k3 # sched: [3:1.00] ; SKX-NEXT: kxorb %k1, %k3, %k1 # sched: [1:1.00] ; SKX-NEXT: kshiftlb $7, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: kshiftrb $1, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: kxorb %k0, %k1, %k0 # sched: [1:1.00] ; SKX-NEXT: kshiftlb $1, %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: kshiftrb $1, %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: kshiftlb $7, %k2, %k1 # sched: [3:1.00] ; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = bitcast i8 %a to <8 x i1> %b1 = bitcast i16 %y to <16 x i1> %el1 = extractelement <16 x i1>%b1, i32 8 %el2 = extractelement <16 x i1>%b1, i32 9 %c = insertelement <8 x i1>%b, i1 %el1, i32 7 %d = insertelement <8 x i1>%c, i1 %el2, i32 6 ret <8 x i1>%d } define <32 x i16> @vmov_test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone { ; GENERIC-LABEL: vmov_test21: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test21: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define void @vmov_test22(<4 x i1> %a, <4 x i1>* %addr) { ; GENERIC-LABEL: vmov_test22: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test22: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] store <4 x i1> %a, <4 x i1>* %addr ret void } define void @vmov_test23(<2 x i1> %a, <2 x i1>* %addr) { ; GENERIC-LABEL: vmov_test23: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test23: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] store <2 x i1> %a, <2 x i1>* %addr ret void } define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; GENERIC-LABEL: store_v1i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kxnorw %k0, %k0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rsi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v1i1: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kxnorw %k0, %k0, %k1 # sched: [1:1.00] ; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, (%rsi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = xor <1 x i1> %c, store <1 x i1> %x, <1 x i1>* %ptr, align 4 ret void } define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { ; GENERIC-LABEL: store_v2i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v2i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = xor <2 x i1> %c, store <2 x i1> %x, <2 x i1>* %ptr, align 4 ret void } define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { ; GENERIC-LABEL: store_v4i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v4i1: ; SKX: # %bb.0: ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = xor <4 x i1> %c, store <4 x i1> %x, <4 x i1>* %ptr, align 4 ret void } define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) { ; GENERIC-LABEL: store_v8i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v8i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = xor <8 x i1> %c, store <8 x i1> %x, <8 x i1>* %ptr, align 4 ret void } define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) { ; GENERIC-LABEL: store_v16i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovw %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_v16i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %x = xor <16 x i1> %c, store <16 x i1> %x, <16 x i1>* %ptr, align 4 ret void } ;void f2(int); ;void f1(int c) ;{ ; static int v = 0; ; if (v == 0) ; v = 1; ; else ; v = 0; ; f2(v); ;} @f1.v = internal unnamed_addr global i1 false, align 4 define void @f1(i32 %c) { ; GENERIC-LABEL: f1: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: movzbl {{.*}}(%rip), %edi # sched: [5:0.50] ; GENERIC-NEXT: xorl $1, %edi # sched: [1:0.33] ; GENERIC-NEXT: movb %dil, {{.*}}(%rip) # sched: [5:1.00] ; GENERIC-NEXT: jmp f2 # TAILCALL ; ; SKX-LABEL: f1: ; SKX: # %bb.0: # %entry ; SKX-NEXT: movzbl {{.*}}(%rip), %edi # sched: [5:0.50] ; SKX-NEXT: xorl $1, %edi # sched: [1:0.25] ; SKX-NEXT: movb %dil, {{.*}}(%rip) # sched: [1:1.00] ; SKX-NEXT: jmp f2 # TAILCALL entry: %.b1 = load i1, i1* @f1.v, align 4 %not..b1 = xor i1 %.b1, true store i1 %not..b1, i1* @f1.v, align 4 %0 = zext i1 %not..b1 to i32 tail call void @f2(i32 %0) #2 ret void } declare void @f2(i32) #1 define void @store_i16_i1(i16 %x, i1 *%y) { ; GENERIC-LABEL: store_i16_i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33] ; GENERIC-NEXT: movb %dil, (%rsi) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_i16_i1: ; SKX: # %bb.0: ; SKX-NEXT: andl $1, %edi # sched: [1:0.25] ; SKX-NEXT: movb %dil, (%rsi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %c = trunc i16 %x to i1 store i1 %c, i1* %y ret void } define void @store_i8_i1(i8 %x, i1 *%y) { ; GENERIC-LABEL: store_i8_i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33] ; GENERIC-NEXT: movb %dil, (%rsi) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_i8_i1: ; SKX: # %bb.0: ; SKX-NEXT: andl $1, %edi # sched: [1:0.25] ; SKX-NEXT: movb %dil, (%rsi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %c = trunc i8 %x to i1 store i1 %c, i1* %y ret void } define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { ; GENERIC-LABEL: test_build_vec_v32i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: movl $1497715861, %eax # imm = 0x59455495 ; GENERIC-NEXT: # sched: [1:0.33] ; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_build_vec_v32i1: ; SKX: # %bb.0: ; SKX-NEXT: movl $1497715861, %eax # imm = 0x59455495 ; SKX-NEXT: # sched: [1:0.25] ; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %ret = select <32 x i1> , <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; GENERIC-LABEL: test_build_vec_v64i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_build_vec_v64i1: ; SKX: # %bb.0: ; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [8:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %ret = select <64 x i1> , <64 x i8> %x, <64 x i8> zeroinitializer ret <64 x i8> %ret } define void @ktest_1(<8 x double> %in, double * %base) { ; GENERIC-LABEL: ktest_1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovupd (%rdi), %zmm1 # sched: [4:0.50] ; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: ktestb %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: je .LBB410_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %L1 ; GENERIC-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB410_2: # %L2 ; GENERIC-NEXT: vmovapd %zmm0, 8(%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ktest_1: ; SKX: # %bb.0: ; SKX-NEXT: vmovupd (%rdi), %zmm1 # sched: [8:0.50] ; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00] ; SKX-NEXT: ktestb %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: je .LBB410_2 # sched: [1:0.50] ; SKX-NEXT: # %bb.1: # %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] ; SKX-NEXT: .LBB410_2: # %L2 ; SKX-NEXT: vmovapd %zmm0, 8(%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %addr1 = getelementptr double, double * %base, i64 0 %addr2 = getelementptr double, double * %base, i64 1 %vaddr1 = bitcast double* %addr1 to <8 x double>* %vaddr2 = bitcast double* %addr2 to <8 x double>* %val1 = load <8 x double>, <8 x double> *%vaddr1, align 1 %val2 = load <8 x double>, <8 x double> *%vaddr2, align 1 %sel1 = fcmp ogt <8 x double>%in, %val1 %val3 = select <8 x i1> %sel1, <8 x double> %val2, <8 x double> zeroinitializer %sel2 = fcmp olt <8 x double> %in, %val3 %sel3 = and <8 x i1> %sel1, %sel2 %int_sel3 = bitcast <8 x i1> %sel3 to i8 %res = icmp eq i8 %int_sel3, zeroinitializer br i1 %res, label %L2, label %L1 L1: store <8 x double> %in, <8 x double>* %vaddr1 br label %End L2: store <8 x double> %in, <8 x double>* %vaddr2 br label %End End: ret void } define void @ktest_2(<32 x float> %in, float * %base) { ; ; GENERIC-LABEL: ktest_2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vmovups (%rdi), %zmm2 # sched: [4:0.50] ; GENERIC-NEXT: vmovups 64(%rdi), %zmm3 # sched: [4:0.50] ; GENERIC-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00] ; GENERIC-NEXT: kunpckwd %k1, %k2, %k0 # sched: [1:1.00] ; GENERIC-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [4:0.50] ; GENERIC-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [4:0.50] ; GENERIC-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00] ; GENERIC-NEXT: kunpckwd %k1, %k2, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: ktestd %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: je .LBB411_2 # sched: [1:1.00] ; GENERIC-NEXT: # %bb.1: # %L1 ; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, 64(%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; GENERIC-NEXT: .LBB411_2: # %L2 ; GENERIC-NEXT: vmovaps %zmm0, 4(%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, 68(%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ktest_2: ; SKX: # %bb.0: ; SKX-NEXT: vmovups (%rdi), %zmm2 # sched: [8:0.50] ; SKX-NEXT: vmovups 64(%rdi), %zmm3 # sched: [8:0.50] ; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00] ; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [8:0.50] ; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: kunpckwd %k1, %k2, %k0 # sched: [3:1.00] ; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00] ; SKX-NEXT: kunpckwd %k1, %k2, %k1 # sched: [3:1.00] ; SKX-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: ktestd %k0, %k0 # sched: [3:1.00] ; SKX-NEXT: je .LBB411_2 # sched: [1:0.50] ; SKX-NEXT: # %bb.1: # %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm1, 64(%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] ; SKX-NEXT: .LBB411_2: # %L2 ; SKX-NEXT: vmovaps %zmm0, 4(%rdi) # sched: [1:1.00] ; SKX-NEXT: vmovaps %zmm1, 68(%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %addr1 = getelementptr float, float * %base, i64 0 %addr2 = getelementptr float, float * %base, i64 1 %vaddr1 = bitcast float* %addr1 to <32 x float>* %vaddr2 = bitcast float* %addr2 to <32 x float>* %val1 = load <32 x float>, <32 x float> *%vaddr1, align 1 %val2 = load <32 x float>, <32 x float> *%vaddr2, align 1 %sel1 = fcmp ogt <32 x float>%in, %val1 %val3 = select <32 x i1> %sel1, <32 x float> %val2, <32 x float> zeroinitializer %sel2 = fcmp olt <32 x float> %in, %val3 %sel3 = or <32 x i1> %sel1, %sel2 %int_sel3 = bitcast <32 x i1> %sel3 to i32 %res = icmp eq i32 %int_sel3, zeroinitializer br i1 %res, label %L2, label %L1 L1: store <32 x float> %in, <32 x float>* %vaddr1 br label %End L2: store <32 x float> %in, <32 x float>* %vaddr2 br label %End End: ret void } define <8 x i64> @load_8i1(<8 x i1>* %a) { ; GENERIC-LABEL: load_8i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovb (%rdi), %k0 ; GENERIC-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: load_8i1: ; SKX: # %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = load <8 x i1>, <8 x i1>* %a %c = sext <8 x i1> %b to <8 x i64> ret <8 x i64> %c } define <16 x i32> @load_16i1(<16 x i1>* %a) { ; GENERIC-LABEL: load_16i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovw (%rdi), %k0 ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: load_16i1: ; SKX: # %bb.0: ; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = load <16 x i1>, <16 x i1>* %a %c = sext <16 x i1> %b to <16 x i32> ret <16 x i32> %c } define <2 x i16> @load_2i1(<2 x i1>* %a) { ; GENERIC-LABEL: load_2i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovb (%rdi), %k0 ; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: load_2i1: ; SKX: # %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = load <2 x i1>, <2 x i1>* %a %c = sext <2 x i1> %b to <2 x i16> ret <2 x i16> %c } define <4 x i16> @load_4i1(<4 x i1>* %a) { ; GENERIC-LABEL: load_4i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovb (%rdi), %k0 ; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: load_4i1: ; SKX: # %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = load <4 x i1>, <4 x i1>* %a %c = sext <4 x i1> %b to <4 x i16> ret <4 x i16> %c } define <32 x i16> @load_32i1(<32 x i1>* %a) { ; GENERIC-LABEL: load_32i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd (%rdi), %k0 ; GENERIC-NEXT: vpmovm2w %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: load_32i1: ; SKX: # %bb.0: ; SKX-NEXT: kmovd (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2w %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = load <32 x i1>, <32 x i1>* %a %c = sext <32 x i1> %b to <32 x i16> ret <32 x i16> %c } define <64 x i8> @load_64i1(<64 x i1>* %a) { ; GENERIC-LABEL: load_64i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovq (%rdi), %k0 ; GENERIC-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: load_64i1: ; SKX: # %bb.0: ; SKX-NEXT: kmovq (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = load <64 x i1>, <64 x i1>* %a %c = sext <64 x i1> %b to <64 x i8> ret <64 x i8> %c } define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) { ; GENERIC-LABEL: store_8i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_8i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] store <8 x i1> %v, <8 x i1>* %a ret void } define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) { ; GENERIC-LABEL: store_8i1_1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovb %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_8i1_1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %v1 = trunc <8 x i16> %v to <8 x i1> store <8 x i1> %v1, <8 x i1>* %a ret void } define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) { ; GENERIC-LABEL: store_16i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovw %k0, (%rdi) ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_16i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] store <16 x i1> %v, <16 x i1>* %a ret void } define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) { ; GENERIC-LABEL: store_32i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: vpmovb2m %ymm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %k0, (%rdi) ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_32i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %ymm0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] store <32 x i1> %v, <32 x i1>* %a ret void } define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) { ; GENERIC-LABEL: store_32i1_1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $15, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovw2m %zmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %k0, (%rdi) ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_32i1_1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $15, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovw2m %zmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %v1 = trunc <32 x i16> %v to <32 x i1> store <32 x i1> %v1, <32 x i1>* %a ret void } define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; GENERIC-LABEL: store_64i1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpsllw $7, %zmm0, %zmm0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovq %k0, (%rdi) ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: store_64i1: ; SKX: # %bb.0: ; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovq %k0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] store <64 x i1> %v, <64 x i1>* %a ret void } define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { ; GENERIC-LABEL: test_bitcast_v8i1_zext: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kmovb %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_bitcast_v8i1_zext: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovb %k0, %eax # sched: [3:1.00] ; SKX-NEXT: addl %eax, %eax # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> %mask1 = bitcast <8 x i1> %mask to i8 %val = zext i8 %mask1 to i32 %val1 = add i32 %val, %val ret i32 %val1 } define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { ; GENERIC-LABEL: test_bitcast_v16i1_zext: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_bitcast_v16i1_zext: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00] ; SKX-NEXT: addl %eax, %eax # sched: [1:0.25] ; SKX-NEXT: vzeroupper # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask1 = bitcast <16 x i1> %v1 to i16 %val = zext i16 %mask1 to i32 %val1 = add i32 %val, %val ret i32 %val1 } define i16 @test_v16i1_add(i16 %x, i16 %y) { ; GENERIC-LABEL: test_v16i1_add: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_v16i1_add: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] ; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> %m2 = add <16 x i1> %m0, %m1 %ret = bitcast <16 x i1> %m2 to i16 ret i16 %ret } define i16 @test_v16i1_sub(i16 %x, i16 %y) { ; GENERIC-LABEL: test_v16i1_sub: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_v16i1_sub: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] ; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> %m2 = sub <16 x i1> %m0, %m1 %ret = bitcast <16 x i1> %m2 to i16 ret i16 %ret } define i16 @test_v16i1_mul(i16 %x, i16 %y) { ; GENERIC-LABEL: test_v16i1_mul: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kandw %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_v16i1_mul: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] ; SKX-NEXT: kandw %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %ax killed %ax killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> %m2 = mul <16 x i1> %m0, %m1 %ret = bitcast <16 x i1> %m2 to i16 ret i16 %ret } define i8 @test_v8i1_add(i8 %x, i8 %y) { ; GENERIC-LABEL: test_v8i1_add: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %al killed %al killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_v8i1_add: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] ; SKX-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %al killed %al killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = add <8 x i1> %m0, %m1 %ret = bitcast <8 x i1> %m2 to i8 ret i8 %ret } define i8 @test_v8i1_sub(i8 %x, i8 %y) { ; GENERIC-LABEL: test_v8i1_sub: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %al killed %al killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_v8i1_sub: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] ; SKX-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %al killed %al killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = sub <8 x i1> %m0, %m1 %ret = bitcast <8 x i1> %m2 to i8 ret i8 %ret } define i8 @test_v8i1_mul(i8 %x, i8 %y) { ; GENERIC-LABEL: test_v8i1_mul: ; GENERIC: # %bb.0: ; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] ; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] ; GENERIC-NEXT: kandb %k1, %k0, %k0 # sched: [1:1.00] ; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33] ; GENERIC-NEXT: # kill: def %al killed %al killed %eax ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_v8i1_mul: ; SKX: # %bb.0: ; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] ; SKX-NEXT: kandb %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: def %al killed %al killed %eax ; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = mul <8 x i1> %m0, %m1 %ret = bitcast <8 x i1> %m2 to i8 ret i8 %ret } define <16 x i32> @_inreg16xi32(i32 %a) { ; GENERIC-LABEL: _inreg16xi32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastd %edi, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _inreg16xi32: ; SKX: # %bb.0: ; SKX-NEXT: vpbroadcastd %edi, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = insertelement <16 x i32> undef, i32 %a, i32 0 %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32> %c } define <8 x i64> @_inreg8xi64(i64 %a) { ; GENERIC-LABEL: _inreg8xi64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _inreg8xi64: ; SKX: # %bb.0: ; SKX-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = insertelement <8 x i64> undef, i64 %a, i32 0 %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer ret <8 x i64> %c } define <16 x float> @_ss16xfloat_v4(<4 x float> %a) { ; GENERIC-LABEL: _ss16xfloat_v4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_v4: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %b } define <16 x float> @_inreg16xfloat(float %a) { ; GENERIC-LABEL: _inreg16xfloat: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _inreg16xfloat: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %c } define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) { ; GENERIC-LABEL: _ss16xfloat_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i ret <16 x float> %r } define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) { ; GENERIC-LABEL: _ss16xfloat_maskz: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_maskz: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer ret <16 x float> %r } define <16 x float> @_ss16xfloat_load(float* %a.ptr) { ; GENERIC-LABEL: _ss16xfloat_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_load: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastss (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load float, float* %a.ptr %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %c } define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) { ; GENERIC-LABEL: _ss16xfloat_mask_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_mask_load: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load float, float* %a.ptr %mask = icmp ne <16 x i32> %mask1, zeroinitializer %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i ret <16 x float> %r } define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) { ; GENERIC-LABEL: _ss16xfloat_maskz_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _ss16xfloat_maskz_load: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load float, float* %a.ptr %mask = icmp ne <16 x i32> %mask1, zeroinitializer %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer ret <16 x float> %r } define <8 x double> @_inreg8xdouble(double %a) { ; GENERIC-LABEL: _inreg8xdouble: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _inreg8xdouble: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer ret <8 x double> %c } define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) { ; GENERIC-LABEL: _sd8xdouble_mask: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_mask: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00] ; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i ret <8 x double> %r } define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) { ; GENERIC-LABEL: _sd8xdouble_maskz: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_maskz: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer ret <8 x double> %r } define <8 x double> @_sd8xdouble_load(double* %a.ptr) { ; GENERIC-LABEL: _sd8xdouble_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_load: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load double, double* %a.ptr %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer ret <8 x double> %c } define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) { ; GENERIC-LABEL: _sd8xdouble_mask_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_mask_load: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load double, double* %a.ptr %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i ret <8 x double> %r } define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) { ; GENERIC-LABEL: _sd8xdouble_maskz_load: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _sd8xdouble_maskz_load: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00] ; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %a = load double, double* %a.ptr %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer ret <8 x double> %r } define <16 x i32> @_xmm16xi32(<16 x i32> %a) { ; GENERIC-LABEL: _xmm16xi32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _xmm16xi32: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32> %b } define <16 x float> @_xmm16xfloat(<16 x float> %a) { ; GENERIC-LABEL: _xmm16xfloat: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _xmm16xfloat: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %b } define <16 x i32> @test_vbroadcast() { ; GENERIC-LABEL: test_vbroadcast: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00] ; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33] ; GENERIC-NEXT: knotw %k0, %k1 # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_vbroadcast: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25] ; SKX-NEXT: knotw %k0, %k1 # sched: [1:1.00] ; SKX-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = sext <16 x i1> zeroinitializer to <16 x i32> %1 = fcmp uno <16 x float> undef, zeroinitializer %2 = sext <16 x i1> %1 to <16 x i32> %3 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> %2 ret <16 x i32> %3 } ; We implement the set1 intrinsics with vector initializers. Verify that the ; IR generated will produce broadcasts at the end. define <8 x double> @test_set1_pd(double %d) #2 { ; GENERIC-LABEL: test_set1_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_set1_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %vecinit.i = insertelement <8 x double> undef, double %d, i32 0 %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1 %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2 %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3 %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4 %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5 %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6 %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7 ret <8 x double> %vecinit7.i } define <8 x i64> @test_set1_epi64(i64 %d) #2 { ; GENERIC-LABEL: test_set1_epi64: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_set1_epi64: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0 %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1 %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2 %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3 %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4 %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5 %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6 %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7 ret <8 x i64> %vecinit7.i } define <16 x float> @test_set1_ps(float %f) #2 { ; GENERIC-LABEL: test_set1_ps: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_set1_ps: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %vecinit.i = insertelement <16 x float> undef, float %f, i32 0 %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1 %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2 %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3 %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4 %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5 %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6 %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7 %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8 %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9 %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10 %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11 %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12 %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13 %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14 %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15 ret <16 x float> %vecinit15.i } define <16 x i32> @test_set1_epi32(i32 %f) #2 { ; GENERIC-LABEL: test_set1_epi32: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vpbroadcastd %edi, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_set1_epi32: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vpbroadcastd %edi, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0 %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1 %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2 %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3 %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4 %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5 %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6 %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7 %vecinit8.i = insertelement <16 x i32> %vecinit7.i, i32 %f, i32 8 %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9 %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10 %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11 %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12 %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13 %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14 %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15 ret <16 x i32> %vecinit15.i } ; We implement the scalar broadcast intrinsics with vector initializers. ; Verify that the IR generated will produce the broadcast at the end. define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) { ; GENERIC-LABEL: test_mm512_broadcastsd_pd: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_mm512_broadcastsd_pd: ; SKX: # %bb.0: # %entry ; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = extractelement <2 x double> %a, i32 0 %vecinit.i = insertelement <8 x double> undef, double %0, i32 0 %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1 %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2 %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3 %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4 %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5 %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6 %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7 ret <8 x double> %vecinit7.i } define <16 x float> @suff_test1(<8 x float>%a) { ; GENERIC-LABEL: suff_test1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: suff_test1: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> zeroinitializer ret <16 x float>%res } define <8 x double> @suff_test2(<4 x double>%a) { ; GENERIC-LABEL: suff_test2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: suff_test2: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> zeroinitializer ret <8 x double>%res } define <64 x i8> @_invec32xi8(<32 x i8>%a) { ; GENERIC-LABEL: _invec32xi8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastb %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _invec32xi8: ; SKX: # %bb.0: ; SKX-NEXT: vpbroadcastb %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i8> %a, <32 x i8> undef, <64 x i32> zeroinitializer ret <64 x i8>%res } define <32 x i16> @_invec16xi16(<16 x i16>%a) { ; GENERIC-LABEL: _invec16xi16: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastw %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _invec16xi16: ; SKX: # %bb.0: ; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i16> %a, <16 x i16> undef, <32 x i32> zeroinitializer ret <32 x i16>%res } define <16 x i32> @_invec8xi32(<8 x i32>%a) { ; GENERIC-LABEL: _invec8xi32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _invec8xi32: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32>%res } define <8 x i64> @_invec4xi64(<4 x i64>%a) { ; GENERIC-LABEL: _invec4xi64: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: _invec4xi64: ; SKX: # %bb.0: ; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %a, <4 x i64> undef, <8 x i32> zeroinitializer ret <8 x i64>%res } declare void @func_f32(float) define <16 x float> @broadcast_ss_spill(float %x) { ; GENERIC-LABEL: broadcast_ss_spill: ; GENERIC: # %bb.0: ; GENERIC-NEXT: subq $24, %rsp # sched: [1:0.33] ; GENERIC-NEXT: .cfi_def_cfa_offset 32 ; GENERIC-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [5:1.00] ; GENERIC-NEXT: callq func_f32 ; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [5:1.00] ; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: broadcast_ss_spill: ; SKX: # %bb.0: ; SKX-NEXT: subq $24, %rsp # sched: [1:0.25] ; SKX-NEXT: .cfi_def_cfa_offset 32 ; SKX-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00] ; SKX-NEXT: callq func_f32 ; SKX-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50] ; SKX-NEXT: addq $24, %rsp # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %a = fadd float %x, %x call void @func_f32(float %a) %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %c } declare void @func_f64(double) define <8 x double> @broadcast_sd_spill(double %x) { ; GENERIC-LABEL: broadcast_sd_spill: ; GENERIC: # %bb.0: ; GENERIC-NEXT: subq $24, %rsp # sched: [1:0.33] ; GENERIC-NEXT: .cfi_def_cfa_offset 32 ; GENERIC-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [5:1.00] ; GENERIC-NEXT: callq func_f64 ; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [5:1.00] ; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: broadcast_sd_spill: ; SKX: # %bb.0: ; SKX-NEXT: subq $24, %rsp # sched: [1:0.25] ; SKX-NEXT: .cfi_def_cfa_offset 32 ; SKX-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00] ; SKX-NEXT: callq func_f64 ; SKX-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50] ; SKX-NEXT: addq $24, %rsp # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %a = fadd double %x, %x call void @func_f64(double %a) %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer ret <8 x double> %c } Index: vendor/llvm/dist/test/CodeGen/X86/pr34080-2.ll =================================================================== --- vendor/llvm/dist/test/CodeGen/X86/pr34080-2.ll (nonexistent) +++ vendor/llvm/dist/test/CodeGen/X86/pr34080-2.ll (revision 327152) @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-openbsd6.2 | FileCheck %s + +%struct.DateTime = type { i64, i32, i32, i32, i32, i32, double, i8 } + +define void @computeJD(%struct.DateTime*) nounwind { +; CHECK-LABEL: computeJD: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $32, %esp +; CHECK-NEXT: movl 8(%ebp), %ebx +; CHECK-NEXT: movl 8(%ebx), %esi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpl $3, 12(%ebx) +; CHECK-NEXT: setl %al +; CHECK-NEXT: subl %eax, %esi +; CHECK-NEXT: movl $-1374389535, %ecx # imm = 0xAE147AE1 +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: imull %ecx +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $5, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %edx, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $7, %edi +; CHECK-NEXT: addl %eax, %edi +; CHECK-NEXT: imull $36525, %esi, %eax # imm = 0x8EAD +; CHECK-NEXT: addl $172251900, %eax # imm = 0xA445AFC +; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $5, %edx +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: addl 16(%ebx), %ecx +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: leal 257(%ecx,%edx), %eax +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fildl {{[0-9]+}}(%esp) +; CHECK-NEXT: fadds {{\.LCPI.*}} +; CHECK-NEXT: fmuls {{\.LCPI.*}} +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movw $3199, {{[0-9]+}}(%esp) # imm = 0xC7F +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb $1, 36(%ebx) +; CHECK-NEXT: imull $3600000, 20(%ebx), %eax # imm = 0x36EE80 +; CHECK-NEXT: imull $60000, 24(%ebx), %ecx # imm = 0xEA60 +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: fldl 28(%ebx) +; CHECK-NEXT: fmuls {{\.LCPI.*}} +; CHECK-NEXT: fnstcw (%esp) +; CHECK-NEXT: movzwl (%esp), %eax +; CHECK-NEXT: movw $3199, (%esp) # imm = 0xC7F +; CHECK-NEXT: fldcw (%esp) +; CHECK-NEXT: movw %ax, (%esp) +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: sarl $31, %eax +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw (%esp) +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ecx, (%ebx) +; CHECK-NEXT: movl %eax, 4(%ebx) +; CHECK-NEXT: leal -12(%ebp), %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl + %2 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 7 + %3 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 1 + %4 = load i32, i32* %3, align 4 + %5 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 2 + %6 = load i32, i32* %5, align 4 + %7 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 3 + %8 = load i32, i32* %7, align 4 + %9 = icmp slt i32 %6, 3 + %10 = add i32 %6, 12 + %11 = select i1 %9, i32 %10, i32 %6 + %12 = sext i1 %9 to i32 + %13 = add i32 %4, %12 + %14 = sdiv i32 %13, -100 + %15 = sdiv i32 %13, 400 + %16 = mul i32 %13, 36525 + %17 = add i32 %16, 172251900 + %18 = sdiv i32 %17, 100 + %19 = mul i32 %11, 306001 + %20 = add i32 %19, 306001 + %21 = sdiv i32 %20, 10000 + %22 = add i32 %8, 2 + %23 = add i32 %22, %14 + %24 = add i32 %23, %15 + %25 = add i32 %24, 255 + %26 = add i32 %25, %18 + %27 = sitofp i32 %26 to double + %28 = fadd double %27, -1.524500e+03 + %29 = fmul double %28, 8.640000e+07 + %30 = fptosi double %29 to i64 + %31 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 0 + store i8 1, i8* %2, align 4 + %32 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 4 + %33 = load i32, i32* %32, align 4 + %34 = mul i32 %33, 3600000 + %35 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 5 + %36 = load i32, i32* %35, align 4 + %37 = mul i32 %36, 60000 + %38 = add i32 %37, %34 + %39 = sext i32 %38 to i64 + %40 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 6 + %41 = load double, double* %40, align 4 + %42 = fmul double %41, 1.000000e+03 + %43 = fptosi double %42 to i64 + %44 = add i64 %39, %43 + %45 = add i64 %44, %30 + store i64 %45, i64* %31, align 4 + ret void +} + +attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: vendor/llvm/dist/test/CodeGen/X86/pr34080.ll =================================================================== --- vendor/llvm/dist/test/CodeGen/X86/pr34080.ll (revision 327151) +++ vendor/llvm/dist/test/CodeGen/X86/pr34080.ll (revision 327152) @@ -1,167 +1,167 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2-BROKEN +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2-SCHEDULE ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 | FileCheck %s --check-prefix=SSE3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 -mcpu=prescott | FileCheck %s --check-prefix=SSE3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -mcpu=sandybridge | FileCheck %s --check-prefix=AVX define void @_Z1fe(x86_fp80 %z) local_unnamed_addr #0 { ; SSE2-LABEL: _Z1fe: ; SSE2: ## %bb.0: ## %entry ; SSE2-NEXT: pushq %rbp ; SSE2-NEXT: .cfi_def_cfa_offset 16 ; SSE2-NEXT: .cfi_offset %rbp, -16 ; SSE2-NEXT: movq %rsp, %rbp ; SSE2-NEXT: .cfi_def_cfa_register %rbp ; SSE2-NEXT: fldt 16(%rbp) ; SSE2-NEXT: fnstcw -4(%rbp) ; SSE2-NEXT: movzwl -4(%rbp), %eax ; SSE2-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F ; SSE2-NEXT: fldcw -4(%rbp) ; SSE2-NEXT: movw %ax, -4(%rbp) ; SSE2-NEXT: fistl -8(%rbp) ; SSE2-NEXT: fldcw -4(%rbp) ; SSE2-NEXT: cvtsi2sdl -8(%rbp), %xmm0 ; SSE2-NEXT: movsd %xmm0, -64(%rbp) ; SSE2-NEXT: movsd %xmm0, -32(%rbp) ; SSE2-NEXT: fsubl -32(%rbp) ; SSE2-NEXT: flds {{.*}}(%rip) ; SSE2-NEXT: fmul %st(0), %st(1) ; SSE2-NEXT: fnstcw -2(%rbp) ; SSE2-NEXT: movzwl -2(%rbp), %eax ; SSE2-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F ; SSE2-NEXT: fldcw -2(%rbp) ; SSE2-NEXT: movw %ax, -2(%rbp) ; SSE2-NEXT: fxch %st(1) ; SSE2-NEXT: fistl -12(%rbp) ; SSE2-NEXT: fldcw -2(%rbp) ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2sdl -12(%rbp), %xmm0 ; SSE2-NEXT: movsd %xmm0, -56(%rbp) ; SSE2-NEXT: movsd %xmm0, -24(%rbp) ; SSE2-NEXT: fsubl -24(%rbp) ; SSE2-NEXT: fmulp %st(1) ; SSE2-NEXT: fstpl -48(%rbp) ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; SSE2-BROKEN-LABEL: _Z1fe: -; SSE2-BROKEN: ## %bb.0: ## %entry -; SSE2-BROKEN-NEXT: pushq %rbp -; SSE2-BROKEN-NEXT: .cfi_def_cfa_offset 16 -; SSE2-BROKEN-NEXT: .cfi_offset %rbp, -16 -; SSE2-BROKEN-NEXT: movq %rsp, %rbp -; SSE2-BROKEN-NEXT: .cfi_def_cfa_register %rbp -; SSE2-BROKEN-NEXT: fnstcw -4(%rbp) -; SSE2-BROKEN-NEXT: fldt 16(%rbp) -; SSE2-BROKEN-NEXT: movzwl -4(%rbp), %eax -; SSE2-BROKEN-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F -; SSE2-BROKEN-NEXT: fldcw -4(%rbp) -; SSE2-BROKEN-NEXT: movw %ax, -4(%rbp) -; SSE2-BROKEN-NEXT: fistl -8(%rbp) -; SSE2-BROKEN-NEXT: fldcw -4(%rbp) -; SSE2-BROKEN-NEXT: cvtsi2sdl -8(%rbp), %xmm0 -; SSE2-BROKEN-NEXT: movsd %xmm0, -64(%rbp) -; SSE2-BROKEN-NEXT: movsd %xmm0, -32(%rbp) -; SSE2-BROKEN-NEXT: fsubl -32(%rbp) -; SSE2-BROKEN-NEXT: flds {{.*}}(%rip) -; SSE2-BROKEN-NEXT: fnstcw -2(%rbp) -; SSE2-BROKEN-NEXT: fmul %st(0), %st(1) -; SSE2-BROKEN-NEXT: movzwl -2(%rbp), %eax -; SSE2-BROKEN-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F -; SSE2-BROKEN-NEXT: fldcw -2(%rbp) -; SSE2-BROKEN-NEXT: movw %ax, -2(%rbp) -; SSE2-BROKEN-NEXT: fxch %st(1) -; SSE2-BROKEN-NEXT: fistl -12(%rbp) -; SSE2-BROKEN-NEXT: fldcw -2(%rbp) -; SSE2-BROKEN-NEXT: xorps %xmm0, %xmm0 -; SSE2-BROKEN-NEXT: cvtsi2sdl -12(%rbp), %xmm0 -; SSE2-BROKEN-NEXT: movsd %xmm0, -56(%rbp) -; SSE2-BROKEN-NEXT: movsd %xmm0, -24(%rbp) -; SSE2-BROKEN-NEXT: fsubl -24(%rbp) -; SSE2-BROKEN-NEXT: fmulp %st(1) -; SSE2-BROKEN-NEXT: fstpl -48(%rbp) -; SSE2-BROKEN-NEXT: popq %rbp -; SSE2-BROKEN-NEXT: retq +; SSE2-SCHEDULE-LABEL: _Z1fe: +; SSE2-SCHEDULE: ## %bb.0: ## %entry +; SSE2-SCHEDULE-NEXT: pushq %rbp +; SSE2-SCHEDULE-NEXT: .cfi_def_cfa_offset 16 +; SSE2-SCHEDULE-NEXT: .cfi_offset %rbp, -16 +; SSE2-SCHEDULE-NEXT: movq %rsp, %rbp +; SSE2-SCHEDULE-NEXT: .cfi_def_cfa_register %rbp +; SSE2-SCHEDULE-NEXT: fnstcw -4(%rbp) +; SSE2-SCHEDULE-NEXT: fldt 16(%rbp) +; SSE2-SCHEDULE-NEXT: movzwl -4(%rbp), %eax +; SSE2-SCHEDULE-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F +; SSE2-SCHEDULE-NEXT: fldcw -4(%rbp) +; SSE2-SCHEDULE-NEXT: movw %ax, -4(%rbp) +; SSE2-SCHEDULE-NEXT: fistl -8(%rbp) +; SSE2-SCHEDULE-NEXT: fldcw -4(%rbp) +; SSE2-SCHEDULE-NEXT: cvtsi2sdl -8(%rbp), %xmm0 +; SSE2-SCHEDULE-NEXT: movsd %xmm0, -64(%rbp) +; SSE2-SCHEDULE-NEXT: movsd %xmm0, -32(%rbp) +; SSE2-SCHEDULE-NEXT: fsubl -32(%rbp) +; SSE2-SCHEDULE-NEXT: flds {{.*}}(%rip) +; SSE2-SCHEDULE-NEXT: fnstcw -2(%rbp) +; SSE2-SCHEDULE-NEXT: fmul %st(0), %st(1) +; SSE2-SCHEDULE-NEXT: movzwl -2(%rbp), %eax +; SSE2-SCHEDULE-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F +; SSE2-SCHEDULE-NEXT: fldcw -2(%rbp) +; SSE2-SCHEDULE-NEXT: movw %ax, -2(%rbp) +; SSE2-SCHEDULE-NEXT: fxch %st(1) +; SSE2-SCHEDULE-NEXT: fistl -12(%rbp) +; SSE2-SCHEDULE-NEXT: fldcw -2(%rbp) +; SSE2-SCHEDULE-NEXT: xorps %xmm0, %xmm0 +; SSE2-SCHEDULE-NEXT: cvtsi2sdl -12(%rbp), %xmm0 +; SSE2-SCHEDULE-NEXT: movsd %xmm0, -56(%rbp) +; SSE2-SCHEDULE-NEXT: movsd %xmm0, -24(%rbp) +; SSE2-SCHEDULE-NEXT: fsubl -24(%rbp) +; SSE2-SCHEDULE-NEXT: fmulp %st(1) +; SSE2-SCHEDULE-NEXT: fstpl -48(%rbp) +; SSE2-SCHEDULE-NEXT: popq %rbp +; SSE2-SCHEDULE-NEXT: retq ; ; SSE3-LABEL: _Z1fe: ; SSE3: ## %bb.0: ## %entry ; SSE3-NEXT: pushq %rbp ; SSE3-NEXT: .cfi_def_cfa_offset 16 ; SSE3-NEXT: .cfi_offset %rbp, -16 ; SSE3-NEXT: movq %rsp, %rbp ; SSE3-NEXT: .cfi_def_cfa_register %rbp ; SSE3-NEXT: fldt 16(%rbp) ; SSE3-NEXT: fld %st(0) ; SSE3-NEXT: fisttpl -4(%rbp) ; SSE3-NEXT: cvtsi2sdl -4(%rbp), %xmm0 ; SSE3-NEXT: movsd %xmm0, -48(%rbp) ; SSE3-NEXT: movsd %xmm0, -24(%rbp) ; SSE3-NEXT: fsubl -24(%rbp) ; SSE3-NEXT: flds {{.*}}(%rip) ; SSE3-NEXT: fmul %st(0), %st(1) ; SSE3-NEXT: fld %st(1) ; SSE3-NEXT: fisttpl -8(%rbp) ; SSE3-NEXT: xorps %xmm0, %xmm0 ; SSE3-NEXT: cvtsi2sdl -8(%rbp), %xmm0 ; SSE3-NEXT: movsd %xmm0, -40(%rbp) ; SSE3-NEXT: movsd %xmm0, -16(%rbp) ; SSE3-NEXT: fxch %st(1) ; SSE3-NEXT: fsubl -16(%rbp) ; SSE3-NEXT: fmulp %st(1) ; SSE3-NEXT: fstpl -32(%rbp) ; SSE3-NEXT: popq %rbp ; SSE3-NEXT: retq ; ; AVX-LABEL: _Z1fe: ; AVX: ## %bb.0: ## %entry ; AVX-NEXT: pushq %rbp ; AVX-NEXT: .cfi_def_cfa_offset 16 ; AVX-NEXT: .cfi_offset %rbp, -16 ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: .cfi_def_cfa_register %rbp ; AVX-NEXT: fldt 16(%rbp) ; AVX-NEXT: fld %st(0) ; AVX-NEXT: fisttpl -4(%rbp) ; AVX-NEXT: vcvtsi2sdl -4(%rbp), %xmm0, %xmm0 ; AVX-NEXT: vmovsd %xmm0, -48(%rbp) ; AVX-NEXT: vmovsd %xmm0, -24(%rbp) ; AVX-NEXT: fsubl -24(%rbp) ; AVX-NEXT: flds {{.*}}(%rip) ; AVX-NEXT: fmul %st(0), %st(1) ; AVX-NEXT: fld %st(1) ; AVX-NEXT: fisttpl -8(%rbp) ; AVX-NEXT: vcvtsi2sdl -8(%rbp), %xmm1, %xmm0 ; AVX-NEXT: vmovsd %xmm0, -40(%rbp) ; AVX-NEXT: vmovsd %xmm0, -16(%rbp) ; AVX-NEXT: fxch %st(1) ; AVX-NEXT: fsubl -16(%rbp) ; AVX-NEXT: fmulp %st(1) ; AVX-NEXT: fstpl -32(%rbp) ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq entry: %tx = alloca [3 x double], align 16 %0 = bitcast [3 x double]* %tx to i8* %conv = fptosi x86_fp80 %z to i32 %conv1 = sitofp i32 %conv to double %arrayidx = getelementptr inbounds [3 x double], [3 x double]* %tx, i64 0, i64 0 store double %conv1, double* %arrayidx, align 16 %conv4 = fpext double %conv1 to x86_fp80 %sub = fsub x86_fp80 %z, %conv4 %mul = fmul x86_fp80 %sub, 0xK40178000000000000000 %conv.1 = fptosi x86_fp80 %mul to i32 %conv1.1 = sitofp i32 %conv.1 to double %arrayidx.1 = getelementptr inbounds [3 x double], [3 x double]* %tx, i64 0, i64 1 store double %conv1.1, double* %arrayidx.1, align 8 %conv4.1 = fpext double %conv1.1 to x86_fp80 %sub.1 = fsub x86_fp80 %mul, %conv4.1 %mul.1 = fmul x86_fp80 %sub.1, 0xK40178000000000000000 %conv5 = fptrunc x86_fp80 %mul.1 to double %arrayidx6 = getelementptr inbounds [3 x double], [3 x double]* %tx, i64 0, i64 2 store double %conv5, double* %arrayidx6, align 16 ret void } attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: vendor/llvm/dist/test/CodeGen/X86/shrink_vmul.ll =================================================================== --- vendor/llvm/dist/test/CodeGen/X86/shrink_vmul.ll (revision 327151) +++ vendor/llvm/dist/test/CodeGen/X86/shrink_vmul.ll (revision 327152) @@ -1,1456 +1,2573 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 @c = external global i32*, align 8 ; %val1 = load <2 x i8> ; %op1 = zext<2 x i32> %val1 ; %val2 = load <2 x i8> ; %op2 = zext<2 x i32> %val2 ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi8: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movzwl (%edx,%ecx), %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movzwl (%eax,%ecx), %eax -; X86-NEXT: movd %eax, %xmm1 -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <2 x i8>* %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val1 = load <4 x i8> ; %op1 = zext<4 x i32> %val1 ; %val2 = load <4 x i8> ; %op2 = zext<4 x i32> %val2 ; %rst = mul <4 x i32> %op1, %op2 ; define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_4xi8: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_4xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_4xi8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_4xi8: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_4xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_4xi8: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <4 x i8>* %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1 %tmp8 = zext <4 x i8> %wide.load to <4 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <4 x i8>* %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1 %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32> %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <4 x i32>* store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 ret void } ; %val1 = load <8 x i8> ; %op1 = zext<8 x i32> %val1 ; %val2 = load <8 x i8> ; %op2 = zext<8 x i32> %val2 ; %rst = mul <8 x i32> %op1, %op2 ; define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_8xi8: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_8xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_8xi8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_8xi8: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_8xi8: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_8xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_8xi8: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_8xi8: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <8 x i8>* %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1 %tmp8 = zext <8 x i8> %wide.load to <8 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <8 x i8>* %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1 %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32> %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <8 x i32>* store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 ret void } ; %val1 = load <16 x i8> ; %op1 = zext<16 x i32> %val1 ; %val2 = load <16 x i8> ; %op2 = zext<16 x i32> %val2 ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_16xi8: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-NEXT: movdqu (%eax,%ecx), %xmm1 -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X86-NEXT: movdqa %xmm1, %xmm4 -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X86-NEXT: pmullw %xmm3, %xmm4 -; X86-NEXT: movdqa %xmm4, %xmm3 -; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; X86-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_16xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X86-SSE-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm3, %xmm4 +; X86-SSE-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_16xi8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm1 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X64-NEXT: pmullw %xmm3, %xmm4 -; X64-NEXT: movdqa %xmm4, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; X64-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm3, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_16xi8: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi8: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X64-SSE-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm3, %xmm4 +; X64-SSE-NEXT: movdqa %xmm4, %xmm3 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi8: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi8: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <16 x i8>* %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1 %tmp8 = zext <16 x i8> %wide.load to <16 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <16 x i8>* %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1 %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32> %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <16 x i32>* store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 ret void } ; %val1 = load <2 x i16> ; %op1 = zext<2 x i32> %val1 ; %val2 = load <2 x i16> ; %op2 = zext<2 x i32> %val2 ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi16: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhuw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhuw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i16>* %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <2 x i16>* %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val1 = load <4 x i16> ; %op1 = zext<4 x i32> %val1 ; %val2 = load <4 x i16> ; %op2 = zext<4 x i32> %val2 ; %rst = mul <4 x i32> %op1, %op2 ; define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_4xi16: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhuw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_4xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_4xi16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhuw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_4xi16: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_4xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_4xi16: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <4 x i16>* %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1 %tmp8 = zext <4 x i16> %wide.load to <4 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <4 x i16>* %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1 %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32> %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <4 x i32>* store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 ret void } ; %val1 = load <8 x i16> ; %op1 = zext<8 x i32> %val1 ; %val2 = load <8 x i16> ; %op2 = zext<8 x i32> %val2 ; %rst = mul <8 x i32> %op1, %op2 ; define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_8xi16: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-NEXT: movdqu (%eax,%ecx), %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhuw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_8xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_8xi16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhuw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_8xi16: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_8xi16: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_8xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_8xi16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_8xi16: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <8 x i16>* %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1 %tmp8 = zext <8 x i16> %wide.load to <8 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <8 x i16>* %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1 %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32> %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <8 x i32>* store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 ret void } ; %val1 = load <16 x i16> ; %op1 = zext<16 x i32> %val1 ; %val2 = load <16 x i16> ; %op2 = zext<16 x i32> %val2 ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_16xi16: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: pmulhuw %xmm0, %xmm4 -; X86-NEXT: pmullw %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-NEXT: movdqa %xmm3, %xmm4 -; X86-NEXT: pmulhuw %xmm1, %xmm4 -; X86-NEXT: pmullw %xmm1, %xmm3 -; X86-NEXT: movdqa %xmm3, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_16xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_16xi16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm2 -; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; X64-NEXT: movdqa %xmm2, %xmm4 -; X64-NEXT: pmulhuw %xmm0, %xmm4 -; X64-NEXT: pmullw %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X64-NEXT: movdqa %xmm3, %xmm4 -; X64-NEXT: pmulhuw %xmm1, %xmm4 -; X64-NEXT: pmullw %xmm1, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_16xi16: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <16 x i16>* %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 %tmp8 = zext <16 x i16> %wide.load to <16 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <16 x i16>* %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32> %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <16 x i32>* store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 ret void } ; %val1 = load <2 x i8> ; %op1 = sext<2 x i32> %val1 ; %val2 = load <2 x i8> ; %op2 = sext<2 x i32> %val2 ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi8_sext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movzwl (%edx,%ecx), %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movzwl (%eax,%ecx), %eax -; X86-NEXT: movd %eax, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm1 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm1 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_sext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm1 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm1 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <2 x i8>* %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val1 = load <2 x i8> ; %op1 = sext<2 x i32> %val1 ; %val2 = load <2 x i8> ; %op2 = zext<2 x i32> %val2 ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi8_sext_zext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movzwl (%edx,%ecx), %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movzwl (%eax,%ecx), %eax -; X86-NEXT: movd %eax, %xmm1 -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_sext_zext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <2 x i8>* %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val1 = load <2 x i16> ; %op1 = sext<2 x i32> %val1 ; %val2 = load <2 x i16> ; %op2 = sext<2 x i32> %val2 ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi16_sext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_sext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i16>* %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <2 x i16>* %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val1 = load <2 x i16> ; %op1 = sext<2 x i32> %val1 ; %val2 = load <2 x i16> ; %op2 = zext<2 x i32> %val2 ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi16_sext_zext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: psrlq $32, %xmm3 -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: paddq %xmm2, %xmm3 -; X86-NEXT: psllq $32, %xmm3 -; X86-NEXT: pmuludq %xmm0, %xmm1 -; X86-NEXT: paddq %xmm3, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: psrlq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: paddq %xmm2, %xmm3 +; X86-SSE-NEXT: psllq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: paddq %xmm3, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_sext_zext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm2 -; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psrlq $32, %xmm3 -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: paddq %xmm2, %xmm3 -; X64-NEXT: psllq $32, %xmm3 -; X64-NEXT: pmuludq %xmm0, %xmm1 -; X64-NEXT: paddq %xmm3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: psrlq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE-NEXT: paddq %xmm2, %xmm3 +; X64-SSE-NEXT: psllq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: paddq %xmm3, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i16>* %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <2 x i16>* %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val1 = load <16 x i16> ; %op1 = sext<16 x i32> %val1 ; %val2 = load <16 x i16> ; %op2 = sext<16 x i32> %val2 ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_16xi16_sext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: pmulhw %xmm0, %xmm4 -; X86-NEXT: pmullw %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-NEXT: movdqa %xmm3, %xmm4 -; X86-NEXT: pmulhw %xmm1, %xmm4 -; X86-NEXT: pmullw %xmm1, %xmm3 -; X86-NEXT: movdqa %xmm3, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_16xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_16xi16_sext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm2 -; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; X64-NEXT: movdqa %xmm2, %xmm4 -; X64-NEXT: pmulhw %xmm0, %xmm4 -; X64-NEXT: pmullw %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X64-NEXT: movdqa %xmm3, %xmm4 -; X64-NEXT: pmulhw %xmm1, %xmm4 -; X64-NEXT: pmullw %xmm1, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_16xi16_sext: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3 +; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16_sext: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 +; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16_sext: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1 +; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2 +; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm3 +; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16_sext: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 +; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <16 x i16>* %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 %tmp8 = sext <16 x i16> %wide.load to <16 x i32> %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index %tmp11 = bitcast i8* %tmp10 to <16 x i16>* %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32> %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <16 x i32>* store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i8> ; %op1 = zext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst1: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst1: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movl $255, %ecx +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i8> ; %op1 = sext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst2: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst2: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i8> ; %op1 = zext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst3: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst3: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i8> ; %op1 = zext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst4: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst4: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = zext <2 x i8> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i8> ; %op1 = sext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst5: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst5: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst5: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst5: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst5: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst5: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i8> ; %op1 = sext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst6: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst6: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst6: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst6: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst6: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst6: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i8>* %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i16> ; %op1 = zext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst1: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhuw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_varconst1: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhuw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i16>* %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i16> ; %op1 = sext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst2: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_varconst2: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i16>* %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i16> ; %op1 = zext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst3: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm0 -; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_varconst3: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-NEXT: movq %rcx, %xmm1 -; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm0 -; X64-NEXT: pmuludq %xmm1, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 +; X64-SSE-NEXT: movq %rcx, %xmm1 +; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i16>* %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; %val = load <2 x i16> ; %op1 = sext<2 x i32> %val ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768) ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst4: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm0 -; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_varconst4: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-NEXT: movq %rcx, %xmm1 -; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm0 -; X64-NEXT: pmuludq %xmm1, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-SSE-NEXT: movq %rcx, %xmm1 +; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index %tmp7 = bitcast i8* %tmp6 to <2 x i16>* %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> %tmp13 = mul nuw nsw <2 x i32> %tmp8, %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index %tmp15 = bitcast i32* %tmp14 to <2 x i32>* store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 ret void } ; ; Illegal Types ; define void @PR34947() { -; X86-LABEL: PR34947: -; X86: # %bb.0: -; X86-NEXT: movdqa (%eax), %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; X86-NEXT: movd %xmm1, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X86-NEXT: movd %xmm2, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl (%eax) -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-NEXT: pmuludq %xmm2, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-NEXT: movl $8199, %eax # imm = 0x2007 -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 -; X86-NEXT: movd %xmm2, (%eax) -; X86-NEXT: movdqa %xmm1, (%eax) -; X86-NEXT: retl +; X86-SSE-LABEL: PR34947: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa (%eax), %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; X86-SSE-NEXT: movd %xmm1, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE-NEXT: movd %xmm2, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE-NEXT: movd %xmm0, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl (%eax) +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-SSE-NEXT: movd %eax, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: movd %xmm2, (%eax) +; X86-SSE-NEXT: movdqa %xmm1, (%eax) +; X86-SSE-NEXT: retl ; -; X64-LABEL: PR34947: -; X64: # %bb.0: -; X64-NEXT: movdqa (%rax), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X64-NEXT: movd %xmm2, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl (%rax) -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm2, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: pmuludq %xmm2, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-NEXT: movl $8199, %eax # imm = 0x2007 -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: movd %xmm2, (%rax) -; X64-NEXT: movdqa %xmm1, (%rax) -; X64-NEXT: retq +; X86-AVX1-LABEL: PR34947: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: pushl %ebx +; X86-AVX1-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX1-NEXT: pushl %edi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 20 +; X86-AVX1-NEXT: subl $16, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 36 +; X86-AVX1-NEXT: .cfi_offset %esi, -20 +; X86-AVX1-NEXT: .cfi_offset %edi, -16 +; X86-AVX1-NEXT: .cfi_offset %ebx, -12 +; X86-AVX1-NEXT: .cfi_offset %ebp, -8 +; X86-AVX1-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl (%eax) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-AVX1-NEXT: vmovd %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ebp +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $2, %xmm0, %esi +; X86-AVX1-NEXT: divl %esi +; X86-AVX1-NEXT: movl %edx, %esi +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edi +; X86-AVX1-NEXT: divl %edi +; X86-AVX1-NEXT: movl %edx, %edi +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vmovd %xmm0, %ebx +; X86-AVX1-NEXT: divl %ebx +; X86-AVX1-NEXT: vmovd %edx, %xmm0 +; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %ebp, %xmm1 +; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload +; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-AVX1-NEXT: vmovd %eax, %xmm3 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] +; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1 +; X86-AVX1-NEXT: vmovd %xmm1, (%eax) +; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: addl $16, %esp +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: popl %edi +; X86-AVX1-NEXT: popl %ebx +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: PR34947: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpextrd $1, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm1, %esi +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %esi +; X86-AVX2-NEXT: vmovd %edx, %xmm2 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1 +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm0, %esi +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %esi +; X86-AVX2-NEXT: vmovd %edx, %xmm2 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl (%eax) +; X86-AVX2-NEXT: vmovd %edx, %xmm1 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X86-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-AVX2-NEXT: vmovd %eax, %xmm2 +; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vmovd %xmm1, (%eax) +; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: PR34947: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa (%rax), %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; X64-SSE-NEXT: movd %xmm1, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE-NEXT: movd %xmm2, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl (%rax) +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-SSE-NEXT: movd %eax, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: movd %xmm2, (%rax) +; X64-SSE-NEXT: movdqa %xmm1, (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: PR34947: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rbp +; X64-AVX1-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX1-NEXT: pushq %rbx +; X64-AVX1-NEXT: .cfi_def_cfa_offset 24 +; X64-AVX1-NEXT: .cfi_offset %rbx, -24 +; X64-AVX1-NEXT: .cfi_offset %rbp, -16 +; X64-AVX1-NEXT: vmovdqa (%rax), %ymm0 +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl (%rax) +; X64-AVX1-NEXT: movl %edx, %r8d +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r9d +; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r10d +; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r11d +; X64-AVX1-NEXT: vmovd %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %esi +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %edi +; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %ecx +; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ebx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebx +; X64-AVX1-NEXT: movl %edx, %ebx +; X64-AVX1-NEXT: vmovd %xmm0, %ebp +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebp +; X64-AVX1-NEXT: vmovd %edx, %xmm0 +; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %esi, %xmm2 +; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovd %r8d, %xmm1 +; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-AVX1-NEXT: vmovd %eax, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vmovd %xmm1, (%rax) +; X64-AVX1-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX1-NEXT: popq %rbx +; X64-AVX1-NEXT: popq %rbp +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: PR34947: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rax), %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm1, %esi +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %esi +; X64-AVX2-NEXT: vmovd %edx, %xmm2 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1 +; X64-AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm0, %esi +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %esi +; X64-AVX2-NEXT: vmovd %edx, %xmm2 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl (%rax) +; X64-AVX2-NEXT: vmovd %edx, %xmm1 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X64-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-AVX2-NEXT: vmovd %eax, %xmm2 +; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovd %xmm1, (%rax) +; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %tmp = load <9 x i32>, <9 x i32>* undef, align 64 %rem = urem <9 x i32> zeroinitializer, %tmp %mul = mul <9 x i32> , %rem store <9 x i32> %mul, <9 x i32>* undef, align 64 ret void } Index: vendor/llvm/dist/test/CodeGen/X86/v8i1-masks.ll =================================================================== --- vendor/llvm/dist/test/CodeGen/X86/v8i1-masks.ll (revision 327151) +++ vendor/llvm/dist/test/CodeGen/X86/v8i1-masks.ll (revision 327152) @@ -1,73 +1,175 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X32 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X32-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64-AVX2 define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { ; X32-LABEL: and_masks: ; X32: ## %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: vmovups (%edx), %ymm0 ; X32-NEXT: vmovups (%ecx), %ymm1 ; X32-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vmovups (%eax), %ymm2 ; X32-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X32-NEXT: vandps LCPI0_0, %ymm0, %ymm0 ; X32-NEXT: vmovaps %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: and_masks: ; X64: ## %bb.0: ; X64-NEXT: vmovups (%rdi), %ymm0 ; X64-NEXT: vmovups (%rsi), %ymm1 ; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vmovups (%rdx), %ymm2 ; X64-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vmovaps %ymm0, (%rax) ; X64-NEXT: vzeroupper ; X64-NEXT: retq +; +; X32-AVX2-LABEL: and_masks: +; X32-AVX2: ## %bb.0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX2-NEXT: vmovups (%edx), %ymm0 +; X32-AVX2-NEXT: vmovups (%ecx), %ymm1 +; X32-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovups (%eax), %ymm2 +; X32-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vmovaps %ymm0, (%eax) +; X32-AVX2-NEXT: vzeroupper +; X32-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: and_masks: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovups (%rsi), %ymm1 +; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovups (%rdx), %ymm2 +; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 16 %v1 = load <8 x float>, <8 x float>* %b, align 16 %m0 = fcmp olt <8 x float> %v1, %v0 %v2 = load <8 x float>, <8 x float>* %c, align 16 %m1 = fcmp olt <8 x float> %v2, %v0 %mand = and <8 x i1> %m1, %m0 %r = zext <8 x i1> %mand to <8 x i32> store <8 x i32> %r, <8 x i32>* undef, align 32 ret void } define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { ; X32-LABEL: neg_masks: ; X32: ## %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: vmovups (%ecx), %ymm0 ; X32-NEXT: vcmpnltps (%eax), %ymm0, %ymm0 ; X32-NEXT: vandps LCPI1_0, %ymm0, %ymm0 ; X32-NEXT: vmovaps %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: neg_masks: ; X64: ## %bb.0: ; X64-NEXT: vmovups (%rsi), %ymm0 ; X64-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0 ; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vmovaps %ymm0, (%rax) ; X64-NEXT: vzeroupper ; X64-NEXT: retq +; +; X32-AVX2-LABEL: neg_masks: +; X32-AVX2: ## %bb.0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX2-NEXT: vmovups (%ecx), %ymm0 +; X32-AVX2-NEXT: vcmpnltps (%eax), %ymm0, %ymm0 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X32-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vmovaps %ymm0, (%eax) +; X32-AVX2-NEXT: vzeroupper +; X32-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: neg_masks: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vmovups (%rsi), %ymm0 +; X64-AVX2-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 16 %v1 = load <8 x float>, <8 x float>* %b, align 16 %m0 = fcmp olt <8 x float> %v1, %v0 %mand = xor <8 x i1> %m0, %r = zext <8 x i1> %mand to <8 x i32> store <8 x i32> %r, <8 x i32>* undef, align 32 ret void } +define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) { +; X32-LABEL: and_mask_constant: +; X32: ## %bb.0: +; X32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; X32-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; X32-NEXT: vpand LCPI2_0, %xmm0, %xmm0 +; X32-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: and_mask_constant: +; X64: ## %bb.0: +; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: retq +; +; X32-AVX2-LABEL: and_mask_constant: +; X32-AVX2: ## %bb.0: +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand LCPI2_0, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: and_mask_constant: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: retq + %m = icmp eq <8 x i32> %v0, zeroinitializer + %mand = and <8 x i1> %m, + %r = zext <8 x i1> %mand to <8 x i32> + ret <8 x i32> %r +} Index: vendor/llvm/dist/test/MC/ELF/comdat-name-number.s =================================================================== --- vendor/llvm/dist/test/MC/ELF/comdat-name-number.s (nonexistent) +++ vendor/llvm/dist/test/MC/ELF/comdat-name-number.s (revision 327152) @@ -0,0 +1,28 @@ +// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -filetype=obj -o %t.o +// RUN: llvm-readobj -elf-section-groups %t.o | FileCheck %s + +// Test that we can handle numeric COMDAT names. + +.section .foo,"G",@progbits,123,comdat +.section .bar,"G",@progbits,abc,comdat + +// CHECK: Groups { +// CHECK-NEXT: Group { +// CHECK-NEXT: Name: .group +// CHECK-NEXT: Index: +// CHECK-NEXT: Type: COMDAT +// CHECK-NEXT: Signature: 123 +// CHECK-NEXT: Section(s) in group [ +// CHECK-NEXT: .foo +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: Group { +// CHECK-NEXT: Name: .group +// CHECK-NEXT: Index: +// CHECK-NEXT: Type: COMDAT +// CHECK-NEXT: Signature: abc +// CHECK-NEXT: Section(s) in group [ +// CHECK-NEXT: .bar +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: } Property changes on: vendor/llvm/dist/test/MC/ELF/comdat-name-number.s ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property