Index: vendor/llvm/dist-release_90/include/llvm/Analysis/InstructionSimplify.h =================================================================== --- vendor/llvm/dist-release_90/include/llvm/Analysis/InstructionSimplify.h (revision 351708) +++ vendor/llvm/dist-release_90/include/llvm/Analysis/InstructionSimplify.h (revision 351709) @@ -1,296 +1,299 @@ //===-- InstructionSimplify.h - Fold instrs into simpler forms --*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file declares routines for folding instructions into simpler forms // that do not require creating new instructions. This does constant folding // ("add i32 1, 1" -> "2") but can also handle non-constant operands, either // returning a constant ("and i32 %x, 0" -> "0") or an already existing value // ("and i32 %x, %x" -> "%x"). If the simplification is also an instruction // then it dominates the original instruction. // // These routines implicitly resolve undef uses. The easiest way to be safe when // using these routines to obtain simplified values for existing instructions is // to always replace all uses of the instructions with the resulting simplified // values. This will prevent other code from seeing the same undef uses and // resolving them to different values. // // These routines are designed to tolerate moderately incomplete IR, such as // instructions that are not connected to basic blocks yet. However, they do // require that all the IR that they encounter be valid. In particular, they // require that all non-constant values be defined in the same function, and the // same call context of that function (and not split between caller and callee // contexts of a directly recursive call, for example). // //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H #define LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H +#include "llvm/ADT/SetVector.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Operator.h" #include "llvm/IR/User.h" namespace llvm { class Function; template class AnalysisManager; template class ArrayRef; class AssumptionCache; class CallBase; class DominatorTree; class DataLayout; class FastMathFlags; struct LoopStandardAnalysisResults; class OptimizationRemarkEmitter; class Pass; class TargetLibraryInfo; class Type; class Value; class MDNode; class BinaryOperator; /// InstrInfoQuery provides an interface to query additional information for /// instructions like metadata or keywords like nsw, which provides conservative /// results if the users specified it is safe to use. struct InstrInfoQuery { InstrInfoQuery(bool UMD) : UseInstrInfo(UMD) {} InstrInfoQuery() : UseInstrInfo(true) {} bool UseInstrInfo = true; MDNode *getMetadata(const Instruction *I, unsigned KindID) const { if (UseInstrInfo) return I->getMetadata(KindID); return nullptr; } template bool hasNoUnsignedWrap(const InstT *Op) const { if (UseInstrInfo) return Op->hasNoUnsignedWrap(); return false; } template bool hasNoSignedWrap(const InstT *Op) const { if (UseInstrInfo) return Op->hasNoSignedWrap(); return false; } bool isExact(const BinaryOperator *Op) const { if (UseInstrInfo && isa(Op)) return cast(Op)->isExact(); return false; } }; struct SimplifyQuery { const DataLayout &DL; const TargetLibraryInfo *TLI = nullptr; const DominatorTree *DT = nullptr; AssumptionCache *AC = nullptr; const Instruction *CxtI = nullptr; // Wrapper to query additional information for instructions like metadata or // keywords like nsw, which provides conservative results if those cannot // be safely used. const InstrInfoQuery IIQ; SimplifyQuery(const DataLayout &DL, const Instruction *CXTI = nullptr) : DL(DL), CxtI(CXTI) {} SimplifyQuery(const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr, const Instruction *CXTI = nullptr, bool UseInstrInfo = true) : DL(DL), TLI(TLI), DT(DT), AC(AC), CxtI(CXTI), IIQ(UseInstrInfo) {} SimplifyQuery getWithInstruction(Instruction *I) const { SimplifyQuery Copy(*this); Copy.CxtI = I; return Copy; } }; // NOTE: the explicit multiple argument versions of these functions are // deprecated. // Please use the SimplifyQuery versions in new code. /// Given operand for an FNeg, fold the result or return null. Value *SimplifyFNegInst(Value *Op, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for an Add, fold the result or return null. Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, const SimplifyQuery &Q); /// Given operands for a Sub, fold the result or return null. Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, const SimplifyQuery &Q); /// Given operands for an FAdd, fold the result or return null. Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for an FSub, fold the result or return null. Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for an FMul, fold the result or return null. Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for a Mul, fold the result or return null. Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an SDiv, fold the result or return null. Value *SimplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for a UDiv, fold the result or return null. Value *SimplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an FDiv, fold the result or return null. Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for an SRem, fold the result or return null. Value *SimplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for a URem, fold the result or return null. Value *SimplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an FRem, fold the result or return null. Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for a Shl, fold the result or return null. Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q); /// Given operands for a LShr, fold the result or return null. Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q); /// Given operands for a AShr, fold the result or return nulll. Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q); /// Given operands for an And, fold the result or return null. Value *SimplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an Or, fold the result or return null. Value *SimplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an Xor, fold the result or return null. Value *SimplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an ICmpInst, fold the result or return null. Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an FCmpInst, fold the result or return null. Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for a SelectInst, fold the result or return null. Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q); /// Given operands for a GetElementPtrInst, fold the result or return null. Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const SimplifyQuery &Q); /// Given operands for an InsertValueInst, fold the result or return null. Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const SimplifyQuery &Q); /// Given operands for an InsertElement, fold the result or return null. Value *SimplifyInsertElementInst(Value *Vec, Value *Elt, Value *Idx, const SimplifyQuery &Q); /// Given operands for an ExtractValueInst, fold the result or return null. Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &Q); /// Given operands for an ExtractElementInst, fold the result or return null. Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q); /// Given operands for a CastInst, fold the result or return null. Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q); /// Given operands for a ShuffleVectorInst, fold the result or return null. Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const SimplifyQuery &Q); //=== Helper functions for higher up the class hierarchy. /// Given operands for a CmpInst, fold the result or return null. Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operand for a UnaryOperator, fold the result or return null. Value *SimplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q); /// Given operand for an FP UnaryOperator, fold the result or return null. /// In contrast to SimplifyUnOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyUnOp. Value *SimplifyFPUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for a BinaryOperator, fold the result or return null. Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an FP BinaryOperator, fold the result or return null. /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given a callsite, fold the result or return null. Value *SimplifyCall(CallBase *Call, const SimplifyQuery &Q); /// See if we can compute a simplified version of this instruction. If not, /// return null. Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE = nullptr); /// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively. /// /// This first performs a normal RAUW of I with SimpleV. It then recursively /// attempts to simplify those users updated by the operation. The 'I' /// instruction must not be equal to the simplified value 'SimpleV'. +/// If UnsimplifiedUsers is provided, instructions that could not be simplified +/// are added to it. /// /// The function returns true if any simplifications were performed. -bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, - const TargetLibraryInfo *TLI = nullptr, - const DominatorTree *DT = nullptr, - AssumptionCache *AC = nullptr); +bool replaceAndRecursivelySimplify( + Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr, + SmallSetVector *UnsimplifiedUsers = nullptr); /// Recursively attempt to simplify an instruction. /// /// This routine uses SimplifyInstruction to simplify 'I', and if successful /// replaces uses of 'I' with the simplified value. It then recurses on each /// of the users impacted. It returns true if any simplifications were /// performed. bool recursivelySimplifyInstruction(Instruction *I, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr); // These helper functions return a SimplifyQuery structure that contains as // many of the optional analysis we use as are currently valid. This is the // strongly preferred way of constructing SimplifyQuery in passes. const SimplifyQuery getBestSimplifyQuery(Pass &, Function &); template const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, Function &); const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &, const DataLayout &); } // end namespace llvm #endif Index: vendor/llvm/dist-release_90/include/llvm/IR/InlineAsm.h =================================================================== --- vendor/llvm/dist-release_90/include/llvm/IR/InlineAsm.h (revision 351708) +++ vendor/llvm/dist-release_90/include/llvm/IR/InlineAsm.h (revision 351709) @@ -1,365 +1,366 @@ //===- llvm/InlineAsm.h - Class to represent inline asm strings -*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This class represents the inline asm strings, which are Value*'s that are // used as the callee operand of call instructions. InlineAsm's are uniqued // like constants, and created via InlineAsm::get(...). // //===----------------------------------------------------------------------===// #ifndef LLVM_IR_INLINEASM_H #define LLVM_IR_INLINEASM_H #include "llvm/ADT/StringRef.h" #include "llvm/IR/Value.h" #include #include #include namespace llvm { class FunctionType; class PointerType; template class ConstantUniqueMap; class InlineAsm final : public Value { public: enum AsmDialect { AD_ATT, AD_Intel }; private: friend struct InlineAsmKeyType; friend class ConstantUniqueMap; std::string AsmString, Constraints; FunctionType *FTy; bool HasSideEffects; bool IsAlignStack; AsmDialect Dialect; InlineAsm(FunctionType *Ty, const std::string &AsmString, const std::string &Constraints, bool hasSideEffects, bool isAlignStack, AsmDialect asmDialect); /// When the ConstantUniqueMap merges two types and makes two InlineAsms /// identical, it destroys one of them with this method. void destroyConstant(); public: InlineAsm(const InlineAsm &) = delete; InlineAsm &operator=(const InlineAsm &) = delete; /// InlineAsm::get - Return the specified uniqued inline asm string. /// static InlineAsm *get(FunctionType *Ty, StringRef AsmString, StringRef Constraints, bool hasSideEffects, bool isAlignStack = false, AsmDialect asmDialect = AD_ATT); bool hasSideEffects() const { return HasSideEffects; } bool isAlignStack() const { return IsAlignStack; } AsmDialect getDialect() const { return Dialect; } /// getType - InlineAsm's are always pointers. /// PointerType *getType() const { return reinterpret_cast(Value::getType()); } /// getFunctionType - InlineAsm's are always pointers to functions. /// FunctionType *getFunctionType() const; const std::string &getAsmString() const { return AsmString; } const std::string &getConstraintString() const { return Constraints; } /// Verify - This static method can be used by the parser to check to see if /// the specified constraint string is legal for the type. This returns true /// if legal, false if not. /// static bool Verify(FunctionType *Ty, StringRef Constraints); // Constraint String Parsing enum ConstraintPrefix { isInput, // 'x' isOutput, // '=x' isClobber // '~x' }; using ConstraintCodeVector = std::vector; struct SubConstraintInfo { /// MatchingInput - If this is not -1, this is an output constraint where an /// input constraint is required to match it (e.g. "0"). The value is the /// constraint number that matches this one (for example, if this is /// constraint #0 and constraint #4 has the value "0", this will be 4). int MatchingInput = -1; /// Code - The constraint code, either the register name (in braces) or the /// constraint letter/number. ConstraintCodeVector Codes; /// Default constructor. SubConstraintInfo() = default; }; using SubConstraintInfoVector = std::vector; struct ConstraintInfo; using ConstraintInfoVector = std::vector; struct ConstraintInfo { /// Type - The basic type of the constraint: input/output/clobber /// ConstraintPrefix Type = isInput; /// isEarlyClobber - "&": output operand writes result before inputs are all /// read. This is only ever set for an output operand. bool isEarlyClobber = false; /// MatchingInput - If this is not -1, this is an output constraint where an /// input constraint is required to match it (e.g. "0"). The value is the /// constraint number that matches this one (for example, if this is /// constraint #0 and constraint #4 has the value "0", this will be 4). int MatchingInput = -1; /// hasMatchingInput - Return true if this is an output constraint that has /// a matching input constraint. bool hasMatchingInput() const { return MatchingInput != -1; } /// isCommutative - This is set to true for a constraint that is commutative /// with the next operand. bool isCommutative = false; /// isIndirect - True if this operand is an indirect operand. This means /// that the address of the source or destination is present in the call /// instruction, instead of it being returned or passed in explicitly. This /// is represented with a '*' in the asm string. bool isIndirect = false; /// Code - The constraint code, either the register name (in braces) or the /// constraint letter/number. ConstraintCodeVector Codes; /// isMultipleAlternative - '|': has multiple-alternative constraints. bool isMultipleAlternative = false; /// multipleAlternatives - If there are multiple alternative constraints, /// this array will contain them. Otherwise it will be empty. SubConstraintInfoVector multipleAlternatives; /// The currently selected alternative constraint index. unsigned currentAlternativeIndex = 0; /// Default constructor. ConstraintInfo() = default; /// Parse - Analyze the specified string (e.g. "=*&{eax}") and fill in the /// fields in this structure. If the constraint string is not understood, /// return true, otherwise return false. bool Parse(StringRef Str, ConstraintInfoVector &ConstraintsSoFar); /// selectAlternative - Point this constraint to the alternative constraint /// indicated by the index. void selectAlternative(unsigned index); }; /// ParseConstraints - Split up the constraint string into the specific /// constraints and their prefixes. If this returns an empty vector, and if /// the constraint string itself isn't empty, there was an error parsing. static ConstraintInfoVector ParseConstraints(StringRef ConstraintString); /// ParseConstraints - Parse the constraints of this inlineasm object, /// returning them the same way that ParseConstraints(str) does. ConstraintInfoVector ParseConstraints() const { return ParseConstraints(Constraints); } // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Value *V) { return V->getValueID() == Value::InlineAsmVal; } // These are helper methods for dealing with flags in the INLINEASM SDNode // in the backend. // // The encoding of the flag word is currently: // Bits 2-0 - A Kind_* value indicating the kind of the operand. // Bits 15-3 - The number of SDNode operands associated with this inline // assembly operand. // If bit 31 is set: // Bit 30-16 - The operand number that this operand must match. // When bits 2-0 are Kind_Mem, the Constraint_* value must be // obtained from the flags for this operand number. // Else if bits 2-0 are Kind_Mem: // Bit 30-16 - A Constraint_* value indicating the original constraint // code. // Else: // Bit 30-16 - The register class ID to use for the operand. enum : uint32_t { // Fixed operands on an INLINEASM SDNode. Op_InputChain = 0, Op_AsmString = 1, Op_MDNode = 2, Op_ExtraInfo = 3, // HasSideEffects, IsAlignStack, AsmDialect. Op_FirstOperand = 4, // Fixed operands on an INLINEASM MachineInstr. MIOp_AsmString = 0, MIOp_ExtraInfo = 1, // HasSideEffects, IsAlignStack, AsmDialect. MIOp_FirstOperand = 2, // Interpretation of the MIOp_ExtraInfo bit field. Extra_HasSideEffects = 1, Extra_IsAlignStack = 2, Extra_AsmDialect = 4, Extra_MayLoad = 8, Extra_MayStore = 16, Extra_IsConvergent = 32, // Inline asm operands map to multiple SDNode / MachineInstr operands. // The first operand is an immediate describing the asm operand, the low // bits is the kind: Kind_RegUse = 1, // Input register, "r". Kind_RegDef = 2, // Output register, "=r". Kind_RegDefEarlyClobber = 3, // Early-clobber output register, "=&r". Kind_Clobber = 4, // Clobbered register, "~r". Kind_Imm = 5, // Immediate. Kind_Mem = 6, // Memory operand, "m". // Memory constraint codes. // These could be tablegenerated but there's little need to do that since // there's plenty of space in the encoding to support the union of all // constraint codes for all targets. Constraint_Unknown = 0, Constraint_es, Constraint_i, Constraint_m, Constraint_o, Constraint_v, + Constraint_A, Constraint_Q, Constraint_R, Constraint_S, Constraint_T, Constraint_Um, Constraint_Un, Constraint_Uq, Constraint_Us, Constraint_Ut, Constraint_Uv, Constraint_Uy, Constraint_X, Constraint_Z, Constraint_ZC, Constraint_Zy, Constraints_Max = Constraint_Zy, Constraints_ShiftAmount = 16, Flag_MatchingOperand = 0x80000000 }; static unsigned getFlagWord(unsigned Kind, unsigned NumOps) { assert(((NumOps << 3) & ~0xffff) == 0 && "Too many inline asm operands!"); assert(Kind >= Kind_RegUse && Kind <= Kind_Mem && "Invalid Kind"); return Kind | (NumOps << 3); } static bool isRegDefKind(unsigned Flag){ return getKind(Flag) == Kind_RegDef;} static bool isImmKind(unsigned Flag) { return getKind(Flag) == Kind_Imm; } static bool isMemKind(unsigned Flag) { return getKind(Flag) == Kind_Mem; } static bool isRegDefEarlyClobberKind(unsigned Flag) { return getKind(Flag) == Kind_RegDefEarlyClobber; } static bool isClobberKind(unsigned Flag) { return getKind(Flag) == Kind_Clobber; } /// getFlagWordForMatchingOp - Augment an existing flag word returned by /// getFlagWord with information indicating that this input operand is tied /// to a previous output operand. static unsigned getFlagWordForMatchingOp(unsigned InputFlag, unsigned MatchedOperandNo) { assert(MatchedOperandNo <= 0x7fff && "Too big matched operand"); assert((InputFlag & ~0xffff) == 0 && "High bits already contain data"); return InputFlag | Flag_MatchingOperand | (MatchedOperandNo << 16); } /// getFlagWordForRegClass - Augment an existing flag word returned by /// getFlagWord with the required register class for the following register /// operands. /// A tied use operand cannot have a register class, use the register class /// from the def operand instead. static unsigned getFlagWordForRegClass(unsigned InputFlag, unsigned RC) { // Store RC + 1, reserve the value 0 to mean 'no register class'. ++RC; assert(!isImmKind(InputFlag) && "Immediates cannot have a register class"); assert(!isMemKind(InputFlag) && "Memory operand cannot have a register class"); assert(RC <= 0x7fff && "Too large register class ID"); assert((InputFlag & ~0xffff) == 0 && "High bits already contain data"); return InputFlag | (RC << 16); } /// Augment an existing flag word returned by getFlagWord with the constraint /// code for a memory constraint. static unsigned getFlagWordForMem(unsigned InputFlag, unsigned Constraint) { assert(isMemKind(InputFlag) && "InputFlag is not a memory constraint!"); assert(Constraint <= 0x7fff && "Too large a memory constraint ID"); assert(Constraint <= Constraints_Max && "Unknown constraint ID"); assert((InputFlag & ~0xffff) == 0 && "High bits already contain data"); return InputFlag | (Constraint << Constraints_ShiftAmount); } static unsigned convertMemFlagWordToMatchingFlagWord(unsigned InputFlag) { assert(isMemKind(InputFlag)); return InputFlag & ~(0x7fff << Constraints_ShiftAmount); } static unsigned getKind(unsigned Flags) { return Flags & 7; } static unsigned getMemoryConstraintID(unsigned Flag) { assert(isMemKind(Flag)); return (Flag >> Constraints_ShiftAmount) & 0x7fff; } /// getNumOperandRegisters - Extract the number of registers field from the /// inline asm operand flag. static unsigned getNumOperandRegisters(unsigned Flag) { return (Flag & 0xffff) >> 3; } /// isUseOperandTiedToDef - Return true if the flag of the inline asm /// operand indicates it is an use operand that's matched to a def operand. static bool isUseOperandTiedToDef(unsigned Flag, unsigned &Idx) { if ((Flag & Flag_MatchingOperand) == 0) return false; Idx = (Flag & ~Flag_MatchingOperand) >> 16; return true; } /// hasRegClassConstraint - Returns true if the flag contains a register /// class constraint. Sets RC to the register class ID. static bool hasRegClassConstraint(unsigned Flag, unsigned &RC) { if (Flag & Flag_MatchingOperand) return false; unsigned High = Flag >> 16; // getFlagWordForRegClass() uses 0 to mean no register class, and otherwise // stores RC + 1. if (!High) return false; RC = High - 1; return true; } }; } // end namespace llvm #endif // LLVM_IR_INLINEASM_H Index: vendor/llvm/dist-release_90/lib/Analysis/InstructionSimplify.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Analysis/InstructionSimplify.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Analysis/InstructionSimplify.cpp (revision 351709) @@ -1,5326 +1,5332 @@ //===- InstructionSimplify.cpp - Fold instruction operands ----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements routines for folding instructions into simpler forms // that do not require creating new instructions. This does constant folding // ("add i32 1, 1" -> "2") but can also handle non-constant operands, either // returning a constant ("and i32 %x, 0" -> "0") or an already existing value // ("and i32 %x, %x" -> "%x"). All operands are assumed to have already been // simplified: This is usually true and assuming it simplifies the logic (if // they have not been simplified then results are correct but maybe suboptimal). // //===----------------------------------------------------------------------===// #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/KnownBits.h" #include using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "instsimplify" enum { RecursionLimit = 3 }; STATISTIC(NumExpand, "Number of expansions"); STATISTIC(NumReassoc, "Number of reassociations"); static Value *SimplifyAndInst(Value *, Value *, const SimplifyQuery &, unsigned); static Value *simplifyUnOp(unsigned, Value *, const SimplifyQuery &, unsigned); static Value *simplifyFPUnOp(unsigned, Value *, const FastMathFlags &, const SimplifyQuery &, unsigned); static Value *SimplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyFPBinOp(unsigned, Value *, Value *, const FastMathFlags &, const SimplifyQuery &, unsigned); static Value *SimplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse); static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyCastInst(unsigned, Value *, Type *, const SimplifyQuery &, unsigned); static Value *SimplifyGEPInst(Type *, ArrayRef, const SimplifyQuery &, unsigned); static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal, Value *FalseVal) { BinaryOperator::BinaryOps BinOpCode; if (auto *BO = dyn_cast(Cond)) BinOpCode = BO->getOpcode(); else return nullptr; CmpInst::Predicate ExpectedPred, Pred1, Pred2; if (BinOpCode == BinaryOperator::Or) { ExpectedPred = ICmpInst::ICMP_NE; } else if (BinOpCode == BinaryOperator::And) { ExpectedPred = ICmpInst::ICMP_EQ; } else return nullptr; // %A = icmp eq %TV, %FV // %B = icmp eq %X, %Y (and one of these is a select operand) // %C = and %A, %B // %D = select %C, %TV, %FV // --> // %FV // %A = icmp ne %TV, %FV // %B = icmp ne %X, %Y (and one of these is a select operand) // %C = or %A, %B // %D = select %C, %TV, %FV // --> // %TV Value *X, *Y; if (!match(Cond, m_c_BinOp(m_c_ICmp(Pred1, m_Specific(TrueVal), m_Specific(FalseVal)), m_ICmp(Pred2, m_Value(X), m_Value(Y)))) || Pred1 != Pred2 || Pred1 != ExpectedPred) return nullptr; if (X == TrueVal || X == FalseVal || Y == TrueVal || Y == FalseVal) return BinOpCode == BinaryOperator::Or ? TrueVal : FalseVal; return nullptr; } /// For a boolean type or a vector of boolean type, return false or a vector /// with every element false. static Constant *getFalse(Type *Ty) { return ConstantInt::getFalse(Ty); } /// For a boolean type or a vector of boolean type, return true or a vector /// with every element true. static Constant *getTrue(Type *Ty) { return ConstantInt::getTrue(Ty); } /// isSameCompare - Is V equivalent to the comparison "LHS Pred RHS"? static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { CmpInst *Cmp = dyn_cast(V); if (!Cmp) return false; CmpInst::Predicate CPred = Cmp->getPredicate(); Value *CLHS = Cmp->getOperand(0), *CRHS = Cmp->getOperand(1); if (CPred == Pred && CLHS == LHS && CRHS == RHS) return true; return CPred == CmpInst::getSwappedPredicate(Pred) && CLHS == RHS && CRHS == LHS; } /// Does the given value dominate the specified phi node? static bool valueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { Instruction *I = dyn_cast(V); if (!I) // Arguments and constants dominate all instructions. return true; // If we are processing instructions (and/or basic blocks) that have not been // fully added to a function, the parent nodes may still be null. Simply // return the conservative answer in these cases. if (!I->getParent() || !P->getParent() || !I->getFunction()) return false; // If we have a DominatorTree then do a precise test. if (DT) return DT->dominates(I, P); // Otherwise, if the instruction is in the entry block and is not an invoke, // then it obviously dominates all phi nodes. if (I->getParent() == &I->getFunction()->getEntryBlock() && !isa(I)) return true; return false; } /// Simplify "A op (B op' C)" by distributing op over op', turning it into /// "(A op B) op' (A op C)". Here "op" is given by Opcode and "op'" is /// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS. /// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)". /// Returns the simplified value, or null if no simplification was performed. static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, Instruction::BinaryOps OpcodeToExpand, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; // Check whether the expression has the form "(A op' B) op C". if (BinaryOperator *Op0 = dyn_cast(LHS)) if (Op0->getOpcode() == OpcodeToExpand) { // It does! Try turning it into "(A op C) op' (B op C)". Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS; // Do "A op C" and "B op C" both simplify? if (Value *L = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) if (Value *R = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { // They do! Return "L op' R" if it simplifies or is already available. // If "L op' R" equals "A op' B" then "L op' R" is just the LHS. if ((L == A && R == B) || (Instruction::isCommutative(OpcodeToExpand) && L == B && R == A)) { ++NumExpand; return LHS; } // Otherwise return "L op' R" if it simplifies. if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) { ++NumExpand; return V; } } } // Check whether the expression has the form "A op (B op' C)". if (BinaryOperator *Op1 = dyn_cast(RHS)) if (Op1->getOpcode() == OpcodeToExpand) { // It does! Try turning it into "(A op B) op' (A op C)". Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1); // Do "A op B" and "A op C" both simplify? if (Value *L = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) if (Value *R = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) { // They do! Return "L op' R" if it simplifies or is already available. // If "L op' R" equals "B op' C" then "L op' R" is just the RHS. if ((L == B && R == C) || (Instruction::isCommutative(OpcodeToExpand) && L == C && R == B)) { ++NumExpand; return RHS; } // Otherwise return "L op' R" if it simplifies. if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) { ++NumExpand; return V; } } } return nullptr; } /// Generic simplifications for associative binary operations. /// Returns the simpler value, or null if none was found. static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { assert(Instruction::isAssociative(Opcode) && "Not an associative operation!"); // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; BinaryOperator *Op0 = dyn_cast(LHS); BinaryOperator *Op1 = dyn_cast(RHS); // Transform: "(A op B) op C" ==> "A op (B op C)" if it simplifies completely. if (Op0 && Op0->getOpcode() == Opcode) { Value *A = Op0->getOperand(0); Value *B = Op0->getOperand(1); Value *C = RHS; // Does "B op C" simplify? if (Value *V = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { // It does! Return "A op V" if it simplifies or is already available. // If V equals B then "A op V" is just the LHS. if (V == B) return LHS; // Otherwise return "A op V" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, A, V, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // Transform: "A op (B op C)" ==> "(A op B) op C" if it simplifies completely. if (Op1 && Op1->getOpcode() == Opcode) { Value *A = LHS; Value *B = Op1->getOperand(0); Value *C = Op1->getOperand(1); // Does "A op B" simplify? if (Value *V = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) { // It does! Return "V op C" if it simplifies or is already available. // If V equals B then "V op C" is just the RHS. if (V == B) return RHS; // Otherwise return "V op C" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, V, C, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // The remaining transforms require commutativity as well as associativity. if (!Instruction::isCommutative(Opcode)) return nullptr; // Transform: "(A op B) op C" ==> "(C op A) op B" if it simplifies completely. if (Op0 && Op0->getOpcode() == Opcode) { Value *A = Op0->getOperand(0); Value *B = Op0->getOperand(1); Value *C = RHS; // Does "C op A" simplify? if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "V op B" if it simplifies or is already available. // If V equals A then "V op B" is just the LHS. if (V == A) return LHS; // Otherwise return "V op B" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, V, B, Q, MaxRecurse)) { ++NumReassoc; return W; } } } // Transform: "A op (B op C)" ==> "B op (C op A)" if it simplifies completely. if (Op1 && Op1->getOpcode() == Opcode) { Value *A = LHS; Value *B = Op1->getOperand(0); Value *C = Op1->getOperand(1); // Does "C op A" simplify? if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "B op V" if it simplifies or is already available. // If V equals C then "B op V" is just the RHS. if (V == C) return RHS; // Otherwise return "B op V" if it simplifies. if (Value *W = SimplifyBinOp(Opcode, B, V, Q, MaxRecurse)) { ++NumReassoc; return W; } } } return nullptr; } /// In the case of a binary operation with a select instruction as an operand, /// try to simplify the binop by seeing whether evaluating it on both branches /// of the select results in the same value. Returns the common value if so, /// otherwise returns null. static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; SelectInst *SI; if (isa(LHS)) { SI = cast(LHS); } else { assert(isa(RHS) && "No select instruction operand!"); SI = cast(RHS); } // Evaluate the BinOp on the true and false branches of the select. Value *TV; Value *FV; if (SI == LHS) { TV = SimplifyBinOp(Opcode, SI->getTrueValue(), RHS, Q, MaxRecurse); FV = SimplifyBinOp(Opcode, SI->getFalseValue(), RHS, Q, MaxRecurse); } else { TV = SimplifyBinOp(Opcode, LHS, SI->getTrueValue(), Q, MaxRecurse); FV = SimplifyBinOp(Opcode, LHS, SI->getFalseValue(), Q, MaxRecurse); } // If they simplified to the same value, then return the common value. // If they both failed to simplify then return null. if (TV == FV) return TV; // If one branch simplified to undef, return the other one. if (TV && isa(TV)) return FV; if (FV && isa(FV)) return TV; // If applying the operation did not change the true and false select values, // then the result of the binop is the select itself. if (TV == SI->getTrueValue() && FV == SI->getFalseValue()) return SI; // If one branch simplified and the other did not, and the simplified // value is equal to the unsimplified one, return the simplified value. // For example, select (cond, X, X & Z) & Z -> X & Z. if ((FV && !TV) || (TV && !FV)) { // Check that the simplified value has the form "X op Y" where "op" is the // same as the original operation. Instruction *Simplified = dyn_cast(FV ? FV : TV); if (Simplified && Simplified->getOpcode() == unsigned(Opcode)) { // The value that didn't simplify is "UnsimplifiedLHS op UnsimplifiedRHS". // We already know that "op" is the same as for the simplified value. See // if the operands match too. If so, return the simplified value. Value *UnsimplifiedBranch = FV ? SI->getTrueValue() : SI->getFalseValue(); Value *UnsimplifiedLHS = SI == LHS ? UnsimplifiedBranch : LHS; Value *UnsimplifiedRHS = SI == LHS ? RHS : UnsimplifiedBranch; if (Simplified->getOperand(0) == UnsimplifiedLHS && Simplified->getOperand(1) == UnsimplifiedRHS) return Simplified; if (Simplified->isCommutative() && Simplified->getOperand(1) == UnsimplifiedLHS && Simplified->getOperand(0) == UnsimplifiedRHS) return Simplified; } } return nullptr; } /// In the case of a comparison with a select instruction, try to simplify the /// comparison by seeing whether both branches of the select result in the same /// value. Returns the common value if so, otherwise returns null. static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; // Make sure the select is on the LHS. if (!isa(LHS)) { std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } assert(isa(LHS) && "Not comparing with a select instruction!"); SelectInst *SI = cast(LHS); Value *Cond = SI->getCondition(); Value *TV = SI->getTrueValue(); Value *FV = SI->getFalseValue(); // Now that we have "cmp select(Cond, TV, FV), RHS", analyse it. // Does "cmp TV, RHS" simplify? Value *TCmp = SimplifyCmpInst(Pred, TV, RHS, Q, MaxRecurse); if (TCmp == Cond) { // It not only simplified, it simplified to the select condition. Replace // it with 'true'. TCmp = getTrue(Cond->getType()); } else if (!TCmp) { // It didn't simplify. However if "cmp TV, RHS" is equal to the select // condition then we can replace it with 'true'. Otherwise give up. if (!isSameCompare(Cond, Pred, TV, RHS)) return nullptr; TCmp = getTrue(Cond->getType()); } // Does "cmp FV, RHS" simplify? Value *FCmp = SimplifyCmpInst(Pred, FV, RHS, Q, MaxRecurse); if (FCmp == Cond) { // It not only simplified, it simplified to the select condition. Replace // it with 'false'. FCmp = getFalse(Cond->getType()); } else if (!FCmp) { // It didn't simplify. However if "cmp FV, RHS" is equal to the select // condition then we can replace it with 'false'. Otherwise give up. if (!isSameCompare(Cond, Pred, FV, RHS)) return nullptr; FCmp = getFalse(Cond->getType()); } // If both sides simplified to the same value, then use it as the result of // the original comparison. if (TCmp == FCmp) return TCmp; // The remaining cases only make sense if the select condition has the same // type as the result of the comparison, so bail out if this is not so. if (Cond->getType()->isVectorTy() != RHS->getType()->isVectorTy()) return nullptr; // If the false value simplified to false, then the result of the compare // is equal to "Cond && TCmp". This also catches the case when the false // value simplified to false and the true value to true, returning "Cond". if (match(FCmp, m_Zero())) if (Value *V = SimplifyAndInst(Cond, TCmp, Q, MaxRecurse)) return V; // If the true value simplified to true, then the result of the compare // is equal to "Cond || FCmp". if (match(TCmp, m_One())) if (Value *V = SimplifyOrInst(Cond, FCmp, Q, MaxRecurse)) return V; // Finally, if the false value simplified to true and the true value to // false, then the result of the compare is equal to "!Cond". if (match(FCmp, m_One()) && match(TCmp, m_Zero())) if (Value *V = SimplifyXorInst(Cond, Constant::getAllOnesValue(Cond->getType()), Q, MaxRecurse)) return V; return nullptr; } /// In the case of a binary operation with an operand that is a PHI instruction, /// try to simplify the binop by seeing whether evaluating it on the incoming /// phi values yields the same result for every value. If so returns the common /// value, otherwise returns null. static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; PHINode *PI; if (isa(LHS)) { PI = cast(LHS); // Bail out if RHS and the phi may be mutually interdependent due to a loop. if (!valueDominatesPHI(RHS, PI, Q.DT)) return nullptr; } else { assert(isa(RHS) && "No PHI instruction operand!"); PI = cast(RHS); // Bail out if LHS and the phi may be mutually interdependent due to a loop. if (!valueDominatesPHI(LHS, PI, Q.DT)) return nullptr; } // Evaluate the BinOp on the incoming phi values. Value *CommonValue = nullptr; for (Value *Incoming : PI->incoming_values()) { // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PI) continue; Value *V = PI == LHS ? SimplifyBinOp(Opcode, Incoming, RHS, Q, MaxRecurse) : SimplifyBinOp(Opcode, LHS, Incoming, Q, MaxRecurse); // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) return nullptr; CommonValue = V; } return CommonValue; } /// In the case of a comparison with a PHI instruction, try to simplify the /// comparison by seeing whether comparing with all of the incoming phi values /// yields the same result every time. If so returns the common result, /// otherwise returns null. static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; // Make sure the phi is on the LHS. if (!isa(LHS)) { std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } assert(isa(LHS) && "Not comparing with a phi instruction!"); PHINode *PI = cast(LHS); // Bail out if RHS and the phi may be mutually interdependent due to a loop. if (!valueDominatesPHI(RHS, PI, Q.DT)) return nullptr; // Evaluate the BinOp on the incoming phi values. Value *CommonValue = nullptr; for (Value *Incoming : PI->incoming_values()) { // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PI) continue; Value *V = SimplifyCmpInst(Pred, Incoming, RHS, Q, MaxRecurse); // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) return nullptr; CommonValue = V; } return CommonValue; } static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode, Value *&Op0, Value *&Op1, const SimplifyQuery &Q) { if (auto *CLHS = dyn_cast(Op0)) { if (auto *CRHS = dyn_cast(Op1)) return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL); // Canonicalize the constant to the RHS if this is a commutative operation. if (Instruction::isCommutative(Opcode)) std::swap(Op0, Op1); } return nullptr; } /// Given operands for an Add, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q)) return C; // X + undef -> undef if (match(Op1, m_Undef())) return Op1; // X + 0 -> X if (match(Op1, m_Zero())) return Op0; // If two operands are negative, return 0. if (isKnownNegation(Op0, Op1)) return Constant::getNullValue(Op0->getType()); // X + (Y - X) -> Y // (Y - X) + X -> Y // Eg: X + -X -> 0 Value *Y = nullptr; if (match(Op1, m_Sub(m_Value(Y), m_Specific(Op0))) || match(Op0, m_Sub(m_Value(Y), m_Specific(Op1)))) return Y; // X + ~X -> -1 since ~X = -X-1 Type *Ty = Op0->getType(); if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Ty); // add nsw/nuw (xor Y, signmask), signmask --> Y // The no-wrapping add guarantees that the top bit will be set by the add. // Therefore, the xor must be clearing the already set sign bit of Y. if ((IsNSW || IsNUW) && match(Op1, m_SignMask()) && match(Op0, m_Xor(m_Value(Y), m_SignMask()))) return Y; // add nuw %x, -1 -> -1, because %x can only be 0. if (IsNUW && match(Op1, m_AllOnes())) return Op1; // Which is -1. /// i1 add -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Add, Op0, Op1, Q, MaxRecurse)) return V; // Threading Add over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A + select(cond, B, C)" means evaluating // "A+B" and "A+C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A+B" and "A+C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, const SimplifyQuery &Query) { return ::SimplifyAddInst(Op0, Op1, IsNSW, IsNUW, Query, RecursionLimit); } /// Compute the base pointer and cumulative constant offsets for V. /// /// This strips all constant offsets off of V, leaving it the base pointer, and /// accumulates the total constant offset applied in the returned constant. It /// returns 0 if V is not a pointer, and returns the constant '0' if there are /// no constant offsets applied. /// /// This is very similar to GetPointerBaseWithConstantOffset except it doesn't /// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc. /// folding. static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, bool AllowNonInbounds = false) { assert(V->getType()->isPtrOrPtrVectorTy()); Type *IntPtrTy = DL.getIntPtrType(V->getType())->getScalarType(); APInt Offset = APInt::getNullValue(IntPtrTy->getIntegerBitWidth()); V = V->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds); // As that strip may trace through `addrspacecast`, need to sext or trunc // the offset calculated. IntPtrTy = DL.getIntPtrType(V->getType())->getScalarType(); Offset = Offset.sextOrTrunc(IntPtrTy->getIntegerBitWidth()); Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset); if (V->getType()->isVectorTy()) return ConstantVector::getSplat(V->getType()->getVectorNumElements(), OffsetIntPtr); return OffsetIntPtr; } /// Compute the constant difference between two pointer values. /// If the difference is not a constant, returns zero. static Constant *computePointerDifference(const DataLayout &DL, Value *LHS, Value *RHS) { Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS); Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS); // If LHS and RHS are not related via constant offsets to the same base // value, there is nothing we can do here. if (LHS != RHS) return nullptr; // Otherwise, the difference of LHS - RHS can be computed as: // LHS - RHS // = (LHSOffset + Base) - (RHSOffset + Base) // = LHSOffset - RHSOffset return ConstantExpr::getSub(LHSOffset, RHSOffset); } /// Given operands for a Sub, see if we can fold the result. /// If not, this returns null. static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q)) return C; // X - undef -> undef // undef - X -> undef if (match(Op0, m_Undef()) || match(Op1, m_Undef())) return UndefValue::get(Op0->getType()); // X - 0 -> X if (match(Op1, m_Zero())) return Op0; // X - X -> 0 if (Op0 == Op1) return Constant::getNullValue(Op0->getType()); // Is this a negation? if (match(Op0, m_Zero())) { // 0 - X -> 0 if the sub is NUW. if (isNUW) return Constant::getNullValue(Op0->getType()); KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (Known.Zero.isMaxSignedValue()) { // Op1 is either 0 or the minimum signed value. If the sub is NSW, then // Op1 must be 0 because negating the minimum signed value is undefined. if (isNSW) return Constant::getNullValue(Op0->getType()); // 0 - X -> X if X is 0 or the minimum signed value. return Op1; } } // (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies. // For example, (X + Y) - Y -> X; (Y + X) - Y -> X Value *X = nullptr, *Y = nullptr, *Z = Op1; if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z // See if "V === Y - Z" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1)) // It does! Now see if "X + V" simplifies. if (Value *W = SimplifyBinOp(Instruction::Add, X, V, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) // It does! Now see if "Y + V" simplifies. if (Value *W = SimplifyBinOp(Instruction::Add, Y, V, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } } // X - (Y + Z) -> (X - Y) - Z or (X - Z) - Y if everything simplifies. // For example, X - (X + 1) -> -1 X = Op0; if (MaxRecurse && match(Op1, m_Add(m_Value(Y), m_Value(Z)))) { // X - (Y + Z) // See if "V === X - Y" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) // It does! Now see if "V - Z" simplifies. if (Value *W = SimplifyBinOp(Instruction::Sub, V, Z, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) // It does! Now see if "V - Y" simplifies. if (Value *W = SimplifyBinOp(Instruction::Sub, V, Y, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } } // Z - (X - Y) -> (Z - X) + Y if everything simplifies. // For example, X - (X - Y) -> Y. Z = Op0; if (MaxRecurse && match(Op1, m_Sub(m_Value(X), m_Value(Y)))) // Z - (X - Y) // See if "V === Z - X" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, Z, X, Q, MaxRecurse-1)) // It does! Now see if "V + Y" simplifies. if (Value *W = SimplifyBinOp(Instruction::Add, V, Y, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // trunc(X) - trunc(Y) -> trunc(X - Y) if everything simplifies. if (MaxRecurse && match(Op0, m_Trunc(m_Value(X))) && match(Op1, m_Trunc(m_Value(Y)))) if (X->getType() == Y->getType()) // See if "V === X - Y" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) // It does! Now see if "trunc V" simplifies. if (Value *W = SimplifyCastInst(Instruction::Trunc, V, Op0->getType(), Q, MaxRecurse - 1)) // It does, return the simplified "trunc V". return W; // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...). if (match(Op0, m_PtrToInt(m_Value(X))) && match(Op1, m_PtrToInt(m_Value(Y)))) if (Constant *Result = computePointerDifference(Q.DL, X, Y)) return ConstantExpr::getIntegerCast(Result, Op0->getType(), true); // i1 sub -> xor. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Threading Sub over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A - select(cond, B, C)" means evaluating // "A-B" and "A-C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A-B" and "A-C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q) { return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); } /// Given operands for a Mul, see if we can fold the result. /// If not, this returns null. static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q)) return C; // X * undef -> 0 // X * 0 -> 0 if (match(Op1, m_CombineOr(m_Undef(), m_Zero()))) return Constant::getNullValue(Op0->getType()); // X * 1 -> X if (match(Op1, m_One())) return Op0; // (X / Y) * Y -> X if the division is exact. Value *X = nullptr; if (Q.IIQ.UseInstrInfo && (match(Op0, m_Exact(m_IDiv(m_Value(X), m_Specific(Op1)))) || // (X / Y) * Y match(Op1, m_Exact(m_IDiv(m_Value(X), m_Specific(Op0)))))) // Y * (X / Y) return X; // i1 mul -> and. if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1)) if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // Mul distributes over Add. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::Mul, Op0, Op1, Instruction::Add, Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyMulInst(Op0, Op1, Q, RecursionLimit); } /// Check for common or similar folds of integer division or integer remainder. /// This applies to all 4 opcodes (sdiv/udiv/srem/urem). static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) { Type *Ty = Op0->getType(); // X / undef -> undef // X % undef -> undef if (match(Op1, m_Undef())) return Op1; // X / 0 -> undef // X % 0 -> undef // We don't need to preserve faults! if (match(Op1, m_Zero())) return UndefValue::get(Ty); // If any element of a constant divisor vector is zero or undef, the whole op // is undef. auto *Op1C = dyn_cast(Op1); if (Op1C && Ty->isVectorTy()) { unsigned NumElts = Ty->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = Op1C->getAggregateElement(i); if (Elt && (Elt->isNullValue() || isa(Elt))) return UndefValue::get(Ty); } } // undef / X -> 0 // undef % X -> 0 if (match(Op0, m_Undef())) return Constant::getNullValue(Ty); // 0 / X -> 0 // 0 % X -> 0 if (match(Op0, m_Zero())) return Constant::getNullValue(Op0->getType()); // X / X -> 1 // X % X -> 0 if (Op0 == Op1) return IsDiv ? ConstantInt::get(Ty, 1) : Constant::getNullValue(Ty); // X / 1 -> X // X % 1 -> 0 // If this is a boolean op (single-bit element type), we can't have // division-by-zero or remainder-by-zero, so assume the divisor is 1. // Similarly, if we're zero-extending a boolean divisor, then assume it's a 1. Value *X; if (match(Op1, m_One()) || Ty->isIntOrIntVectorTy(1) || (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))) return IsDiv ? Op0 : Constant::getNullValue(Ty); return nullptr; } /// Given a predicate and two operands, return true if the comparison is true. /// This is a helper for div/rem simplification where we return some other value /// when we can prove a relationship between the operands. static bool isICmpTrue(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Value *V = SimplifyICmpInst(Pred, LHS, RHS, Q, MaxRecurse); Constant *C = dyn_cast_or_null(V); return (C && C->isAllOnesValue()); } /// Return true if we can simplify X / Y to 0. Remainder can adapt that answer /// to simplify X % Y to X. static bool isDivZero(Value *X, Value *Y, const SimplifyQuery &Q, unsigned MaxRecurse, bool IsSigned) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return false; if (IsSigned) { // |X| / |Y| --> 0 // // We require that 1 operand is a simple constant. That could be extended to // 2 variables if we computed the sign bit for each. // // Make sure that a constant is not the minimum signed value because taking // the abs() of that is undefined. Type *Ty = X->getType(); const APInt *C; if (match(X, m_APInt(C)) && !C->isMinSignedValue()) { // Is the variable divisor magnitude always greater than the constant // dividend magnitude? // |Y| > |C| --> Y < -abs(C) or Y > abs(C) Constant *PosDividendC = ConstantInt::get(Ty, C->abs()); Constant *NegDividendC = ConstantInt::get(Ty, -C->abs()); if (isICmpTrue(CmpInst::ICMP_SLT, Y, NegDividendC, Q, MaxRecurse) || isICmpTrue(CmpInst::ICMP_SGT, Y, PosDividendC, Q, MaxRecurse)) return true; } if (match(Y, m_APInt(C))) { // Special-case: we can't take the abs() of a minimum signed value. If // that's the divisor, then all we have to do is prove that the dividend // is also not the minimum signed value. if (C->isMinSignedValue()) return isICmpTrue(CmpInst::ICMP_NE, X, Y, Q, MaxRecurse); // Is the variable dividend magnitude always less than the constant // divisor magnitude? // |X| < |C| --> X > -abs(C) and X < abs(C) Constant *PosDivisorC = ConstantInt::get(Ty, C->abs()); Constant *NegDivisorC = ConstantInt::get(Ty, -C->abs()); if (isICmpTrue(CmpInst::ICMP_SGT, X, NegDivisorC, Q, MaxRecurse) && isICmpTrue(CmpInst::ICMP_SLT, X, PosDivisorC, Q, MaxRecurse)) return true; } return false; } // IsSigned == false. // Is the dividend unsigned less than the divisor? return isICmpTrue(ICmpInst::ICMP_ULT, X, Y, Q, MaxRecurse); } /// These are simplifications common to SDiv and UDiv. static Value *simplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; if (Value *V = simplifyDivRem(Op0, Op1, true)) return V; bool IsSigned = Opcode == Instruction::SDiv; // (X * Y) / Y -> X if the multiplication does not overflow. Value *X; if (match(Op0, m_c_Mul(m_Value(X), m_Specific(Op1)))) { auto *Mul = cast(Op0); // If the Mul does not overflow, then we are good to go. if ((IsSigned && Q.IIQ.hasNoSignedWrap(Mul)) || (!IsSigned && Q.IIQ.hasNoUnsignedWrap(Mul))) return X; // If X has the form X = A / Y, then X * Y cannot overflow. if ((IsSigned && match(X, m_SDiv(m_Value(), m_Specific(Op1)))) || (!IsSigned && match(X, m_UDiv(m_Value(), m_Specific(Op1))))) return X; } // (X rem Y) / Y -> 0 if ((IsSigned && match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) || (!IsSigned && match(Op0, m_URem(m_Value(), m_Specific(Op1))))) return Constant::getNullValue(Op0->getType()); // (X /u C1) /u C2 -> 0 if C1 * C2 overflow ConstantInt *C1, *C2; if (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_ConstantInt(C1))) && match(Op1, m_ConstantInt(C2))) { bool Overflow; (void)C1->getValue().umul_ov(C2->getValue(), Overflow); if (Overflow) return Constant::getNullValue(Op0->getType()); } // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; if (isDivZero(Op0, Op1, Q, MaxRecurse, IsSigned)) return Constant::getNullValue(Op0->getType()); return nullptr; } /// These are simplifications common to SRem and URem. static Value *simplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; if (Value *V = simplifyDivRem(Op0, Op1, false)) return V; // (X % Y) % Y -> X % Y if ((Opcode == Instruction::SRem && match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) || (Opcode == Instruction::URem && match(Op0, m_URem(m_Value(), m_Specific(Op1))))) return Op0; // (X << Y) % X -> 0 if (Q.IIQ.UseInstrInfo && ((Opcode == Instruction::SRem && match(Op0, m_NSWShl(m_Specific(Op1), m_Value()))) || (Opcode == Instruction::URem && match(Op0, m_NUWShl(m_Specific(Op1), m_Value()))))) return Constant::getNullValue(Op0->getType()); // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If X / Y == 0, then X % Y == X. if (isDivZero(Op0, Op1, Q, MaxRecurse, Opcode == Instruction::SRem)) return Op0; return nullptr; } /// Given operands for an SDiv, see if we can fold the result. /// If not, this returns null. static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { // If two operands are negated and no signed overflow, return -1. if (isKnownNegation(Op0, Op1, /*NeedNSW=*/true)) return Constant::getAllOnesValue(Op0->getType()); return simplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifySDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a UDiv, see if we can fold the result. /// If not, this returns null. static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyUDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for an SRem, see if we can fold the result. /// If not, this returns null. static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { // If the divisor is 0, the result is undefined, so assume the divisor is -1. // srem Op0, (sext i1 X) --> srem Op0, -1 --> 0 Value *X; if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) return ConstantInt::getNullValue(Op0->getType()); // If the two operands are negated, return 0. if (isKnownNegation(Op0, Op1)) return ConstantInt::getNullValue(Op0->getType()); return simplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifySRemInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a URem, see if we can fold the result. /// If not, this returns null. static Value *SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse); } Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyURemInst(Op0, Op1, Q, RecursionLimit); } /// Returns true if a shift by \c Amount always yields undef. static bool isUndefShift(Value *Amount) { Constant *C = dyn_cast(Amount); if (!C) return false; // X shift by undef -> undef because it may shift by the bitwidth. if (isa(C)) return true; // Shifting by the bitwidth or more is undefined. if (ConstantInt *CI = dyn_cast(C)) if (CI->getValue().getLimitedValue() >= CI->getType()->getScalarSizeInBits()) return true; // If all lanes of a vector shift are undefined the whole shift is. if (isa(C) || isa(C)) { for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; ++I) if (!isUndefShift(C->getAggregateElement(I))) return false; return true; } return false; } /// Given operands for an Shl, LShr or AShr, see if we can fold the result. /// If not, this returns null. static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; // 0 shift by X -> 0 if (match(Op0, m_Zero())) return Constant::getNullValue(Op0->getType()); // X shift by 0 -> X // Shift-by-sign-extended bool must be shift-by-0 because shift-by-all-ones // would be poison. Value *X; if (match(Op1, m_Zero()) || (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))) return Op0; // Fold undefined shifts. if (isUndefShift(Op1)) return UndefValue::get(Op0->getType()); // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If any bits in the shift amount make that value greater than or equal to // the number of bits in the type, the shift is undefined. KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (Known.One.getLimitedValue() >= Known.getBitWidth()) return UndefValue::get(Op0->getType()); // If all valid bits in the shift amount are known zero, the first operand is // unchanged. unsigned NumValidShiftBits = Log2_32_Ceil(Known.getBitWidth()); if (Known.countMinTrailingZeros() >= NumValidShiftBits) return Op0; return nullptr; } /// Given operands for an Shl, LShr or AShr, see if we can /// fold the result. If not, this returns null. static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // X >> X -> 0 if (Op0 == Op1) return Constant::getNullValue(Op0->getType()); // undef >> X -> 0 // undef >> X -> undef (if it's exact) if (match(Op0, m_Undef())) return isExact ? Op0 : Constant::getNullValue(Op0->getType()); // The low bit cannot be shifted out of an exact shift if it is set. if (isExact) { KnownBits Op0Known = computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); if (Op0Known.One[0]) return Op0; } return nullptr; } /// Given operands for an Shl, see if we can fold the result. /// If not, this returns null. static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse)) return V; // undef << X -> 0 // undef << X -> undef if (if it's NSW/NUW) if (match(Op0, m_Undef())) return isNSW || isNUW ? Op0 : Constant::getNullValue(Op0->getType()); // (X >> A) << A -> X Value *X; if (Q.IIQ.UseInstrInfo && match(Op0, m_Exact(m_Shr(m_Value(X), m_Specific(Op1))))) return X; // shl nuw i8 C, %x -> C iff C has sign bit set. if (isNUW && match(Op0, m_Negative())) return Op0; // NOTE: could use computeKnownBits() / LazyValueInfo, // but the cost-benefit analysis suggests it isn't worth it. return nullptr; } Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const SimplifyQuery &Q) { return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); } /// Given operands for an LShr, see if we can fold the result. /// If not, this returns null. static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; // (X << A) >> A -> X Value *X; if (match(Op0, m_NUWShl(m_Value(X), m_Specific(Op1)))) return X; // ((X << A) | Y) >> A -> X if effective width of Y is not larger than A. // We can return X as we do in the above case since OR alters no bits in X. // SimplifyDemandedBits in InstCombine can do more general optimization for // bit manipulation. This pattern aims to provide opportunities for other // optimizers by supporting a simple but common case in InstSimplify. Value *Y; const APInt *ShRAmt, *ShLAmt; if (match(Op1, m_APInt(ShRAmt)) && match(Op0, m_c_Or(m_NUWShl(m_Value(X), m_APInt(ShLAmt)), m_Value(Y))) && *ShRAmt == *ShLAmt) { const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); const unsigned Width = Op0->getType()->getScalarSizeInBits(); const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros(); if (ShRAmt->uge(EffWidthY)) return X; } return nullptr; } Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q) { return ::SimplifyLShrInst(Op0, Op1, isExact, Q, RecursionLimit); } /// Given operands for an AShr, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; // all ones >>a X -> -1 // Do not return Op0 because it may contain undef elements if it's a vector. if (match(Op0, m_AllOnes())) return Constant::getAllOnesValue(Op0->getType()); // (X << A) >> A -> X Value *X; if (Q.IIQ.UseInstrInfo && match(Op0, m_NSWShl(m_Value(X), m_Specific(Op1)))) return X; // Arithmetic shifting an all-sign-bit value is a no-op. unsigned NumSignBits = ComputeNumSignBits(Op0, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (NumSignBits == Op0->getType()->getScalarSizeInBits()) return Op0; return nullptr; } Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const SimplifyQuery &Q) { return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit); } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, ICmpInst *UnsignedICmp, bool IsAnd) { Value *X, *Y; ICmpInst::Predicate EqPred; if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(Y), m_Zero())) || !ICmpInst::isEquality(EqPred)) return nullptr; ICmpInst::Predicate UnsignedPred; if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Value(X), m_Specific(Y))) && ICmpInst::isUnsigned(UnsignedPred)) ; else if (match(UnsignedICmp, m_ICmp(UnsignedPred, m_Specific(Y), m_Value(X))) && ICmpInst::isUnsigned(UnsignedPred)) UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); else return nullptr; // X < Y && Y != 0 --> X < Y // X < Y || Y != 0 --> Y != 0 if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE) return IsAnd ? UnsignedICmp : ZeroICmp; // X >= Y || Y != 0 --> true // X >= Y || Y == 0 --> X >= Y if (UnsignedPred == ICmpInst::ICMP_UGE && !IsAnd) { if (EqPred == ICmpInst::ICMP_NE) return getTrue(UnsignedICmp->getType()); return UnsignedICmp; } // X < Y && Y == 0 --> false if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_EQ && IsAnd) return getFalse(UnsignedICmp->getType()); return nullptr; } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; Value *A ,*B; if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) || !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B)))) return nullptr; // We have (icmp Pred0, A, B) & (icmp Pred1, A, B). // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we // can eliminate Op1 from this 'and'. if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1)) return Op0; // Check for any combination of predicates that are guaranteed to be disjoint. if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) || (Pred0 == ICmpInst::ICMP_EQ && ICmpInst::isFalseWhenEqual(Pred1)) || (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT) || (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT)) return getFalse(Op0->getType()); return nullptr; } /// Commuted variants are assumed to be handled by calling this function again /// with the parameters swapped. static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; Value *A ,*B; if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) || !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B)))) return nullptr; // We have (icmp Pred0, A, B) | (icmp Pred1, A, B). // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we // can eliminate Op0 from this 'or'. if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1)) return Op1; // Check for any combination of predicates that cover the entire range of // possibilities. if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) || (Pred0 == ICmpInst::ICMP_NE && ICmpInst::isTrueWhenEqual(Pred1)) || (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGE) || (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGE)) return getTrue(Op0->getType()); return nullptr; } /// Test if a pair of compares with a shared operand and 2 constants has an /// empty set intersection, full set union, or if one compare is a superset of /// the other. static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd) { // Look for this pattern: {and/or} (icmp X, C0), (icmp X, C1)). if (Cmp0->getOperand(0) != Cmp1->getOperand(0)) return nullptr; const APInt *C0, *C1; if (!match(Cmp0->getOperand(1), m_APInt(C0)) || !match(Cmp1->getOperand(1), m_APInt(C1))) return nullptr; auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0); auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1); // For and-of-compares, check if the intersection is empty: // (icmp X, C0) && (icmp X, C1) --> empty set --> false if (IsAnd && Range0.intersectWith(Range1).isEmptySet()) return getFalse(Cmp0->getType()); // For or-of-compares, check if the union is full: // (icmp X, C0) || (icmp X, C1) --> full set --> true if (!IsAnd && Range0.unionWith(Range1).isFullSet()) return getTrue(Cmp0->getType()); // Is one range a superset of the other? // If this is and-of-compares, take the smaller set: // (icmp sgt X, 4) && (icmp sgt X, 42) --> icmp sgt X, 42 // If this is or-of-compares, take the larger set: // (icmp sgt X, 4) || (icmp sgt X, 42) --> icmp sgt X, 4 if (Range0.contains(Range1)) return IsAnd ? Cmp1 : Cmp0; if (Range1.contains(Range0)) return IsAnd ? Cmp0 : Cmp1; return nullptr; } static Value *simplifyAndOrOfICmpsWithZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd) { ICmpInst::Predicate P0 = Cmp0->getPredicate(), P1 = Cmp1->getPredicate(); if (!match(Cmp0->getOperand(1), m_Zero()) || !match(Cmp1->getOperand(1), m_Zero()) || P0 != P1) return nullptr; if ((IsAnd && P0 != ICmpInst::ICMP_NE) || (!IsAnd && P1 != ICmpInst::ICMP_EQ)) return nullptr; // We have either "(X == 0 || Y == 0)" or "(X != 0 && Y != 0)". Value *X = Cmp0->getOperand(0); Value *Y = Cmp1->getOperand(0); // If one of the compares is a masked version of a (not) null check, then // that compare implies the other, so we eliminate the other. Optionally, look // through a pointer-to-int cast to match a null check of a pointer type. // (X == 0) || (([ptrtoint] X & ?) == 0) --> ([ptrtoint] X & ?) == 0 // (X == 0) || ((? & [ptrtoint] X) == 0) --> (? & [ptrtoint] X) == 0 // (X != 0) && (([ptrtoint] X & ?) != 0) --> ([ptrtoint] X & ?) != 0 // (X != 0) && ((? & [ptrtoint] X) != 0) --> (? & [ptrtoint] X) != 0 if (match(Y, m_c_And(m_Specific(X), m_Value())) || match(Y, m_c_And(m_PtrToInt(m_Specific(X)), m_Value()))) return Cmp1; // (([ptrtoint] Y & ?) == 0) || (Y == 0) --> ([ptrtoint] Y & ?) == 0 // ((? & [ptrtoint] Y) == 0) || (Y == 0) --> (? & [ptrtoint] Y) == 0 // (([ptrtoint] Y & ?) != 0) && (Y != 0) --> ([ptrtoint] Y & ?) != 0 // ((? & [ptrtoint] Y) != 0) && (Y != 0) --> (? & [ptrtoint] Y) != 0 if (match(X, m_c_And(m_Specific(Y), m_Value())) || match(X, m_c_And(m_PtrToInt(m_Specific(Y)), m_Value()))) return Cmp0; return nullptr; } static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1, const InstrInfoQuery &IIQ) { // (icmp (add V, C0), C1) & (icmp V, C0) ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; Value *V; if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1)))) return nullptr; if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value()))) return nullptr; auto *AddInst = cast(Op0->getOperand(0)); if (AddInst->getOperand(1) != Op1->getOperand(1)) return nullptr; Type *ITy = Op0->getType(); bool isNSW = IIQ.hasNoSignedWrap(AddInst); bool isNUW = IIQ.hasNoUnsignedWrap(AddInst); const APInt Delta = *C1 - *C0; if (C0->isStrictlyPositive()) { if (Delta == 2) { if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_SGT) return getFalse(ITy); if (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT && isNSW) return getFalse(ITy); } if (Delta == 1) { if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_SGT) return getFalse(ITy); if (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGT && isNSW) return getFalse(ITy); } } if (C0->getBoolValue() && isNUW) { if (Delta == 2) if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Delta == 1) if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGT) return getFalse(ITy); } return nullptr; } static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1, const InstrInfoQuery &IIQ) { if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true)) return X; if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true)) return X; if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) return X; if (Value *X = simplifyAndOfICmpsWithSameOperands(Op1, Op0)) return X; if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true)) return X; if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, true)) return X; if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1, IIQ)) return X; if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0, IIQ)) return X; return nullptr; } static Value *simplifyOrOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1, const InstrInfoQuery &IIQ) { // (icmp (add V, C0), C1) | (icmp V, C0) ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; Value *V; if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1)))) return nullptr; if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value()))) return nullptr; auto *AddInst = cast(Op0->getOperand(0)); if (AddInst->getOperand(1) != Op1->getOperand(1)) return nullptr; Type *ITy = Op0->getType(); bool isNSW = IIQ.hasNoSignedWrap(AddInst); bool isNUW = IIQ.hasNoUnsignedWrap(AddInst); const APInt Delta = *C1 - *C0; if (C0->isStrictlyPositive()) { if (Delta == 2) { if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_SLE) return getTrue(ITy); if (Pred0 == ICmpInst::ICMP_SGE && Pred1 == ICmpInst::ICMP_SLE && isNSW) return getTrue(ITy); } if (Delta == 1) { if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_SLE) return getTrue(ITy); if (Pred0 == ICmpInst::ICMP_SGT && Pred1 == ICmpInst::ICMP_SLE && isNSW) return getTrue(ITy); } } if (C0->getBoolValue() && isNUW) { if (Delta == 2) if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_ULE) return getTrue(ITy); if (Delta == 1) if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_ULE) return getTrue(ITy); } return nullptr; } static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1, const InstrInfoQuery &IIQ) { if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false)) return X; if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false)) return X; if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1)) return X; if (Value *X = simplifyOrOfICmpsWithSameOperands(Op1, Op0)) return X; if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false)) return X; if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, false)) return X; if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1, IIQ)) return X; if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0, IIQ)) return X; return nullptr; } static Value *simplifyAndOrOfFCmps(const TargetLibraryInfo *TLI, FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) { Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1); Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1); if (LHS0->getType() != RHS0->getType()) return nullptr; FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) || (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) { // (fcmp ord NNAN, X) & (fcmp ord X, Y) --> fcmp ord X, Y // (fcmp ord NNAN, X) & (fcmp ord Y, X) --> fcmp ord Y, X // (fcmp ord X, NNAN) & (fcmp ord X, Y) --> fcmp ord X, Y // (fcmp ord X, NNAN) & (fcmp ord Y, X) --> fcmp ord Y, X // (fcmp uno NNAN, X) | (fcmp uno X, Y) --> fcmp uno X, Y // (fcmp uno NNAN, X) | (fcmp uno Y, X) --> fcmp uno Y, X // (fcmp uno X, NNAN) | (fcmp uno X, Y) --> fcmp uno X, Y // (fcmp uno X, NNAN) | (fcmp uno Y, X) --> fcmp uno Y, X if ((isKnownNeverNaN(LHS0, TLI) && (LHS1 == RHS0 || LHS1 == RHS1)) || (isKnownNeverNaN(LHS1, TLI) && (LHS0 == RHS0 || LHS0 == RHS1))) return RHS; // (fcmp ord X, Y) & (fcmp ord NNAN, X) --> fcmp ord X, Y // (fcmp ord Y, X) & (fcmp ord NNAN, X) --> fcmp ord Y, X // (fcmp ord X, Y) & (fcmp ord X, NNAN) --> fcmp ord X, Y // (fcmp ord Y, X) & (fcmp ord X, NNAN) --> fcmp ord Y, X // (fcmp uno X, Y) | (fcmp uno NNAN, X) --> fcmp uno X, Y // (fcmp uno Y, X) | (fcmp uno NNAN, X) --> fcmp uno Y, X // (fcmp uno X, Y) | (fcmp uno X, NNAN) --> fcmp uno X, Y // (fcmp uno Y, X) | (fcmp uno X, NNAN) --> fcmp uno Y, X if ((isKnownNeverNaN(RHS0, TLI) && (RHS1 == LHS0 || RHS1 == LHS1)) || (isKnownNeverNaN(RHS1, TLI) && (RHS0 == LHS0 || RHS0 == LHS1))) return LHS; } return nullptr; } static Value *simplifyAndOrOfCmps(const SimplifyQuery &Q, Value *Op0, Value *Op1, bool IsAnd) { // Look through casts of the 'and' operands to find compares. auto *Cast0 = dyn_cast(Op0); auto *Cast1 = dyn_cast(Op1); if (Cast0 && Cast1 && Cast0->getOpcode() == Cast1->getOpcode() && Cast0->getSrcTy() == Cast1->getSrcTy()) { Op0 = Cast0->getOperand(0); Op1 = Cast1->getOperand(0); } Value *V = nullptr; auto *ICmp0 = dyn_cast(Op0); auto *ICmp1 = dyn_cast(Op1); if (ICmp0 && ICmp1) V = IsAnd ? simplifyAndOfICmps(ICmp0, ICmp1, Q.IIQ) : simplifyOrOfICmps(ICmp0, ICmp1, Q.IIQ); auto *FCmp0 = dyn_cast(Op0); auto *FCmp1 = dyn_cast(Op1); if (FCmp0 && FCmp1) V = simplifyAndOrOfFCmps(Q.TLI, FCmp0, FCmp1, IsAnd); if (!V) return nullptr; if (!Cast0) return V; // If we looked through casts, we can only handle a constant simplification // because we are not allowed to create a cast instruction here. if (auto *C = dyn_cast(V)) return ConstantExpr::getCast(Cast0->getOpcode(), C, Cast0->getType()); return nullptr; } /// Given operands for an And, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q)) return C; // X & undef -> 0 if (match(Op1, m_Undef())) return Constant::getNullValue(Op0->getType()); // X & X = X if (Op0 == Op1) return Op0; // X & 0 = 0 if (match(Op1, m_Zero())) return Constant::getNullValue(Op0->getType()); // X & -1 = X if (match(Op1, m_AllOnes())) return Op0; // A & ~A = ~A & A = 0 if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getNullValue(Op0->getType()); // (A | ?) & A = A if (match(Op0, m_c_Or(m_Specific(Op1), m_Value()))) return Op1; // A & (A | ?) = A if (match(Op1, m_c_Or(m_Specific(Op0), m_Value()))) return Op0; // A mask that only clears known zeros of a shifted value is a no-op. Value *X; const APInt *Mask; const APInt *ShAmt; if (match(Op1, m_APInt(Mask))) { // If all bits in the inverted and shifted mask are clear: // and (shl X, ShAmt), Mask --> shl X, ShAmt if (match(Op0, m_Shl(m_Value(X), m_APInt(ShAmt))) && (~(*Mask)).lshr(*ShAmt).isNullValue()) return Op0; // If all bits in the inverted and shifted mask are clear: // and (lshr X, ShAmt), Mask --> lshr X, ShAmt if (match(Op0, m_LShr(m_Value(X), m_APInt(ShAmt))) && (~(*Mask)).shl(*ShAmt).isNullValue()) return Op0; } // A & (-A) = A if A is a power of two or zero. if (match(Op0, m_Neg(m_Specific(Op1))) || match(Op1, m_Neg(m_Specific(Op0)))) { if (isKnownToBeAPowerOfTwo(Op0, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT)) return Op0; if (isKnownToBeAPowerOfTwo(Op1, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT)) return Op1; } // This is a similar pattern used for checking if a value is a power-of-2: // (A - 1) & A --> 0 (if A is a power-of-2 or 0) // A & (A - 1) --> 0 (if A is a power-of-2 or 0) if (match(Op0, m_Add(m_Specific(Op1), m_AllOnes())) && isKnownToBeAPowerOfTwo(Op1, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT)) return Constant::getNullValue(Op1->getType()); if (match(Op1, m_Add(m_Specific(Op0), m_AllOnes())) && isKnownToBeAPowerOfTwo(Op0, Q.DL, /*OrZero*/ true, 0, Q.AC, Q.CxtI, Q.DT)) return Constant::getNullValue(Op0->getType()); if (Value *V = simplifyAndOrOfCmps(Q, Op0, Op1, true)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; // And distributes over Or. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Or, Q, MaxRecurse)) return V; // And distributes over Xor. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Xor, Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; // Assuming the effective width of Y is not larger than A, i.e. all bits // from X and Y are disjoint in (X << A) | Y, // if the mask of this AND op covers all bits of X or Y, while it covers // no bits from the other, we can bypass this AND op. E.g., // ((X << A) | Y) & Mask -> Y, // if Mask = ((1 << effective_width_of(Y)) - 1) // ((X << A) | Y) & Mask -> X << A, // if Mask = ((1 << effective_width_of(X)) - 1) << A // SimplifyDemandedBits in InstCombine can optimize the general case. // This pattern aims to help other passes for a common case. Value *Y, *XShifted; if (match(Op1, m_APInt(Mask)) && match(Op0, m_c_Or(m_CombineAnd(m_NUWShl(m_Value(X), m_APInt(ShAmt)), m_Value(XShifted)), m_Value(Y)))) { const unsigned Width = Op0->getType()->getScalarSizeInBits(); const unsigned ShftCnt = ShAmt->getLimitedValue(Width); const KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); const unsigned EffWidthY = Width - YKnown.countMinLeadingZeros(); if (EffWidthY <= ShftCnt) { const KnownBits XKnown = computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); const unsigned EffWidthX = Width - XKnown.countMinLeadingZeros(); const APInt EffBitsY = APInt::getLowBitsSet(Width, EffWidthY); const APInt EffBitsX = APInt::getLowBitsSet(Width, EffWidthX) << ShftCnt; // If the mask is extracting all bits from X or Y as is, we can skip // this AND op. if (EffBitsY.isSubsetOf(*Mask) && !EffBitsX.intersects(*Mask)) return Y; if (EffBitsX.isSubsetOf(*Mask) && !EffBitsY.intersects(*Mask)) return XShifted; } } return nullptr; } Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for an Or, see if we can fold the result. /// If not, this returns null. static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q)) return C; // X | undef -> -1 // X | -1 = -1 // Do not return Op1 because it may contain undef elements if it's a vector. if (match(Op1, m_Undef()) || match(Op1, m_AllOnes())) return Constant::getAllOnesValue(Op0->getType()); // X | X = X // X | 0 = X if (Op0 == Op1 || match(Op1, m_Zero())) return Op0; // A | ~A = ~A | A = -1 if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Op0->getType()); // (A & ?) | A = A if (match(Op0, m_c_And(m_Specific(Op1), m_Value()))) return Op1; // A | (A & ?) = A if (match(Op1, m_c_And(m_Specific(Op0), m_Value()))) return Op0; // ~(A & ?) | A = -1 if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op1->getType()); // A | ~(A & ?) = -1 if (match(Op1, m_Not(m_c_And(m_Specific(Op1), m_Value())))) return Constant::getAllOnesValue(Op0->getType()); Value *A, *B; // (A & ~B) | (A ^ B) -> (A ^ B) // (~B & A) | (A ^ B) -> (A ^ B) // (A & ~B) | (B ^ A) -> (B ^ A) // (~B & A) | (B ^ A) -> (B ^ A) if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (match(Op0, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || match(Op0, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) return Op1; // Commute the 'or' operands. // (A ^ B) | (A & ~B) -> (A ^ B) // (A ^ B) | (~B & A) -> (A ^ B) // (B ^ A) | (A & ~B) -> (B ^ A) // (B ^ A) | (~B & A) -> (B ^ A) if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && (match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) return Op0; // (A & B) | (~A ^ B) -> (~A ^ B) // (B & A) | (~A ^ B) -> (~A ^ B) // (A & B) | (B ^ ~A) -> (B ^ ~A) // (B & A) | (B ^ ~A) -> (B ^ ~A) if (match(Op0, m_And(m_Value(A), m_Value(B))) && (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) return Op1; // (~A ^ B) | (A & B) -> (~A ^ B) // (~A ^ B) | (B & A) -> (~A ^ B) // (B ^ ~A) | (A & B) -> (B ^ ~A) // (B ^ ~A) | (B & A) -> (B ^ ~A) if (match(Op1, m_And(m_Value(A), m_Value(B))) && (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) return Op0; if (Value *V = simplifyAndOrOfCmps(Q, Op0, Op1, false)) return V; // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; // Or distributes over And. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::Or, Op0, Op1, Instruction::And, Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverSelect(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; // (A & C1)|(B & C2) const APInt *C1, *C2; if (match(Op0, m_And(m_Value(A), m_APInt(C1))) && match(Op1, m_And(m_Value(B), m_APInt(C2)))) { if (*C1 == ~*C2) { // (A & C1)|(B & C2) // If we have: ((V + N) & C1) | (V & C2) // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 // replace with V+N. Value *N; if (C2->isMask() && // C2 == 0+1+ match(A, m_c_Add(m_Specific(B), m_Value(N)))) { // Add commutes, try both ways. if (MaskedValueIsZero(N, *C2, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return A; } // Or commutes, try both ways. if (C1->isMask() && match(B, m_c_Add(m_Specific(A), m_Value(N)))) { // Add commutes, try both ways. if (MaskedValueIsZero(N, *C1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return B; } } } // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa(Op0) || isa(Op1)) if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyOrInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a Xor, see if we can fold the result. /// If not, this returns null. static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q)) return C; // A ^ undef -> undef if (match(Op1, m_Undef())) return Op1; // A ^ 0 = A if (match(Op1, m_Zero())) return Op0; // A ^ A = 0 if (Op0 == Op1) return Constant::getNullValue(Op0->getType()); // A ^ ~A = ~A ^ A = -1 if (match(Op0, m_Not(m_Specific(Op1))) || match(Op1, m_Not(m_Specific(Op0)))) return Constant::getAllOnesValue(Op0->getType()); // Try some generic simplifications for associative operations. if (Value *V = SimplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, Q, MaxRecurse)) return V; // Threading Xor over selects and phi nodes is pointless, so don't bother. // Threading over the select in "A ^ select(cond, B, C)" means evaluating // "A^B" and "A^C" and seeing if they are equal; but they are equal if and // only if B and C are equal. If B and C are equal then (since we assume // that operands have already been simplified) "select(cond, B, C)" should // have been simplified to the common value of B and C already. Analysing // "A^B" and "A^C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. return nullptr; } Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { return ::SimplifyXorInst(Op0, Op1, Q, RecursionLimit); } static Type *GetCompareTy(Value *Op) { return CmpInst::makeCmpResultType(Op->getType()); } /// Rummage around inside V looking for something equivalent to the comparison /// "LHS Pred RHS". Return such a value if found, otherwise return null. /// Helper function for analyzing max/min idioms. static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { SelectInst *SI = dyn_cast(V); if (!SI) return nullptr; CmpInst *Cmp = dyn_cast(SI->getCondition()); if (!Cmp) return nullptr; Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1); if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS) return Cmp; if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) && LHS == CmpRHS && RHS == CmpLHS) return Cmp; return nullptr; } // A significant optimization not implemented here is assuming that alloca // addresses are not equal to incoming argument values. They don't *alias*, // as we say, but that doesn't mean they aren't equal, so we take a // conservative approach. // // This is inspired in part by C++11 5.10p1: // "Two pointers of the same type compare equal if and only if they are both // null, both point to the same function, or both represent the same // address." // // This is pretty permissive. // // It's also partly due to C11 6.5.9p6: // "Two pointers compare equal if and only if both are null pointers, both are // pointers to the same object (including a pointer to an object and a // subobject at its beginning) or function, both are pointers to one past the // last element of the same array object, or one is a pointer to one past the // end of one array object and the other is a pointer to the start of a // different array object that happens to immediately follow the first array // object in the address space.) // // C11's version is more restrictive, however there's no reason why an argument // couldn't be a one-past-the-end value for a stack object in the caller and be // equal to the beginning of a stack object in the callee. // // If the C and C++ standards are ever made sufficiently restrictive in this // area, it may be possible to update LLVM's semantics accordingly and reinstate // this optimization. static Constant * computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, CmpInst::Predicate Pred, AssumptionCache *AC, const Instruction *CxtI, const InstrInfoQuery &IIQ, Value *LHS, Value *RHS) { // First, skip past any trivial no-ops. LHS = LHS->stripPointerCasts(); RHS = RHS->stripPointerCasts(); // A non-null pointer is not equal to a null pointer. if (llvm::isKnownNonZero(LHS, DL, 0, nullptr, nullptr, nullptr, IIQ.UseInstrInfo) && isa(RHS) && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE)) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); // We can only fold certain predicates on pointer comparisons. switch (Pred) { default: return nullptr; // Equality comaprisons are easy to fold. case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: break; // We can only handle unsigned relational comparisons because 'inbounds' on // a GEP only protects against unsigned wrapping. case CmpInst::ICMP_UGT: case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: // However, we have to switch them to their signed variants to handle // negative indices from the base pointer. Pred = ICmpInst::getSignedPredicate(Pred); break; } // Strip off any constant offsets so that we can reason about them. // It's tempting to use getUnderlyingObject or even just stripInBoundsOffsets // here and compare base addresses like AliasAnalysis does, however there are // numerous hazards. AliasAnalysis and its utilities rely on special rules // governing loads and stores which don't apply to icmps. Also, AliasAnalysis // doesn't need to guarantee pointer inequality when it says NoAlias. Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS); Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS); // If LHS and RHS are related via constant offsets to the same base // value, we can replace it with an icmp which just compares the offsets. if (LHS == RHS) return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset); // Various optimizations for (in)equality comparisons. if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { // Different non-empty allocations that exist at the same time have // different addresses (if the program can tell). Global variables always // exist, so they always exist during the lifetime of each other and all // allocas. Two different allocas usually have different addresses... // // However, if there's an @llvm.stackrestore dynamically in between two // allocas, they may have the same address. It's tempting to reduce the // scope of the problem by only looking at *static* allocas here. That would // cover the majority of allocas while significantly reducing the likelihood // of having an @llvm.stackrestore pop up in the middle. However, it's not // actually impossible for an @llvm.stackrestore to pop up in the middle of // an entry block. Also, if we have a block that's not attached to a // function, we can't tell if it's "static" under the current definition. // Theoretically, this problem could be fixed by creating a new kind of // instruction kind specifically for static allocas. Such a new instruction // could be required to be at the top of the entry block, thus preventing it // from being subject to a @llvm.stackrestore. Instcombine could even // convert regular allocas into these special allocas. It'd be nifty. // However, until then, this problem remains open. // // So, we'll assume that two non-empty allocas have different addresses // for now. // // With all that, if the offsets are within the bounds of their allocations // (and not one-past-the-end! so we can't use inbounds!), and their // allocations aren't the same, the pointers are not equal. // // Note that it's not necessary to check for LHS being a global variable // address, due to canonicalization and constant folding. if (isa(LHS) && (isa(RHS) || isa(RHS))) { ConstantInt *LHSOffsetCI = dyn_cast(LHSOffset); ConstantInt *RHSOffsetCI = dyn_cast(RHSOffset); uint64_t LHSSize, RHSSize; ObjectSizeOpts Opts; Opts.NullIsUnknownSize = NullPointerIsDefined(cast(LHS)->getFunction()); if (LHSOffsetCI && RHSOffsetCI && getObjectSize(LHS, LHSSize, DL, TLI, Opts) && getObjectSize(RHS, RHSSize, DL, TLI, Opts)) { const APInt &LHSOffsetValue = LHSOffsetCI->getValue(); const APInt &RHSOffsetValue = RHSOffsetCI->getValue(); if (!LHSOffsetValue.isNegative() && !RHSOffsetValue.isNegative() && LHSOffsetValue.ult(LHSSize) && RHSOffsetValue.ult(RHSSize)) { return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); } } // Repeat the above check but this time without depending on DataLayout // or being able to compute a precise size. if (!cast(LHS->getType())->isEmptyTy() && !cast(RHS->getType())->isEmptyTy() && LHSOffset->isNullValue() && RHSOffset->isNullValue()) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); } // Even if an non-inbounds GEP occurs along the path we can still optimize // equality comparisons concerning the result. We avoid walking the whole // chain again by starting where the last calls to // stripAndComputeConstantOffsets left off and accumulate the offsets. Constant *LHSNoBound = stripAndComputeConstantOffsets(DL, LHS, true); Constant *RHSNoBound = stripAndComputeConstantOffsets(DL, RHS, true); if (LHS == RHS) return ConstantExpr::getICmp(Pred, ConstantExpr::getAdd(LHSOffset, LHSNoBound), ConstantExpr::getAdd(RHSOffset, RHSNoBound)); // If one side of the equality comparison must come from a noalias call // (meaning a system memory allocation function), and the other side must // come from a pointer that cannot overlap with dynamically-allocated // memory within the lifetime of the current function (allocas, byval // arguments, globals), then determine the comparison result here. SmallVector LHSUObjs, RHSUObjs; GetUnderlyingObjects(LHS, LHSUObjs, DL); GetUnderlyingObjects(RHS, RHSUObjs, DL); // Is the set of underlying objects all noalias calls? auto IsNAC = [](ArrayRef Objects) { return all_of(Objects, isNoAliasCall); }; // Is the set of underlying objects all things which must be disjoint from // noalias calls. For allocas, we consider only static ones (dynamic // allocas might be transformed into calls to malloc not simultaneously // live with the compared-to allocation). For globals, we exclude symbols // that might be resolve lazily to symbols in another dynamically-loaded // library (and, thus, could be malloc'ed by the implementation). auto IsAllocDisjoint = [](ArrayRef Objects) { return all_of(Objects, [](const Value *V) { if (const AllocaInst *AI = dyn_cast(V)) return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); if (const GlobalValue *GV = dyn_cast(V)) return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) && !GV->isThreadLocal(); if (const Argument *A = dyn_cast(V)) return A->hasByValAttr(); return false; }); }; if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) || (IsNAC(RHSUObjs) && IsAllocDisjoint(LHSUObjs))) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); // Fold comparisons for non-escaping pointer even if the allocation call // cannot be elided. We cannot fold malloc comparison to null. Also, the // dynamic allocation call could be either of the operands. Value *MI = nullptr; if (isAllocLikeFn(LHS, TLI) && llvm::isKnownNonZero(RHS, DL, 0, nullptr, CxtI, DT)) MI = LHS; else if (isAllocLikeFn(RHS, TLI) && llvm::isKnownNonZero(LHS, DL, 0, nullptr, CxtI, DT)) MI = RHS; // FIXME: We should also fold the compare when the pointer escapes, but the // compare dominates the pointer escape if (MI && !PointerMayBeCaptured(MI, true, true)) return ConstantInt::get(GetCompareTy(LHS), CmpInst::isFalseWhenEqual(Pred)); } // Otherwise, fail. return nullptr; } /// Fold an icmp when its operands have i1 scalar type. static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q) { Type *ITy = GetCompareTy(LHS); // The return type. Type *OpTy = LHS->getType(); // The operand type. if (!OpTy->isIntOrIntVectorTy(1)) return nullptr; // A boolean compared to true/false can be simplified in 14 out of the 20 // (10 predicates * 2 constants) possible combinations. Cases not handled here // require a 'not' of the LHS, so those must be transformed in InstCombine. if (match(RHS, m_Zero())) { switch (Pred) { case CmpInst::ICMP_NE: // X != 0 -> X case CmpInst::ICMP_UGT: // X >u 0 -> X case CmpInst::ICMP_SLT: // X X return LHS; case CmpInst::ICMP_ULT: // X false case CmpInst::ICMP_SGT: // X >s 0 -> false return getFalse(ITy); case CmpInst::ICMP_UGE: // X >=u 0 -> true case CmpInst::ICMP_SLE: // X <=s 0 -> true return getTrue(ITy); default: break; } } else if (match(RHS, m_One())) { switch (Pred) { case CmpInst::ICMP_EQ: // X == 1 -> X case CmpInst::ICMP_UGE: // X >=u 1 -> X case CmpInst::ICMP_SLE: // X <=s -1 -> X return LHS; case CmpInst::ICMP_UGT: // X >u 1 -> false case CmpInst::ICMP_SLT: // X false return getFalse(ITy); case CmpInst::ICMP_ULE: // X <=u 1 -> true case CmpInst::ICMP_SGE: // X >=s -1 -> true return getTrue(ITy); default: break; } } switch (Pred) { default: break; case ICmpInst::ICMP_UGE: if (isImpliedCondition(RHS, LHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; case ICmpInst::ICMP_SGE: /// For signed comparison, the values for an i1 are 0 and -1 /// respectively. This maps into a truth table of: /// LHS | RHS | LHS >=s RHS | LHS implies RHS /// 0 | 0 | 1 (0 >= 0) | 1 /// 0 | 1 | 1 (0 >= -1) | 1 /// 1 | 0 | 0 (-1 >= 0) | 0 /// 1 | 1 | 1 (-1 >= -1) | 1 if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; case ICmpInst::ICMP_ULE: if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false)) return getTrue(ITy); break; } return nullptr; } /// Try hard to fold icmp with zero RHS because this is a common case. static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q) { if (!match(RHS, m_Zero())) return nullptr; Type *ITy = GetCompareTy(LHS); // The return type. switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); case ICmpInst::ICMP_ULT: return getFalse(ITy); case ICmpInst::ICMP_UGE: return getTrue(ITy); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE: if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT, Q.IIQ.UseInstrInfo)) return getFalse(ITy); break; case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT, Q.IIQ.UseInstrInfo)) return getTrue(ITy); break; case ICmpInst::ICMP_SLT: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getTrue(ITy); if (LHSKnown.isNonNegative()) return getFalse(ITy); break; } case ICmpInst::ICMP_SLE: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getTrue(ITy); if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getFalse(ITy); break; } case ICmpInst::ICMP_SGE: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getFalse(ITy); if (LHSKnown.isNonNegative()) return getTrue(ITy); break; } case ICmpInst::ICMP_SGT: { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNegative()) return getFalse(ITy); if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; } } return nullptr; } static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const InstrInfoQuery &IIQ) { Type *ITy = GetCompareTy(RHS); // The return type. Value *X; // Sign-bit checks can be optimized to true/false after unsigned // floating-point casts: // icmp slt (bitcast (uitofp X)), 0 --> false // icmp sgt (bitcast (uitofp X)), -1 --> true if (match(LHS, m_BitCast(m_UIToFP(m_Value(X))))) { if (Pred == ICmpInst::ICMP_SLT && match(RHS, m_Zero())) return ConstantInt::getFalse(ITy); if (Pred == ICmpInst::ICMP_SGT && match(RHS, m_AllOnes())) return ConstantInt::getTrue(ITy); } const APInt *C; if (!match(RHS, m_APInt(C))) return nullptr; // Rule out tautological comparisons (eg., ult 0 or uge 0). ConstantRange RHS_CR = ConstantRange::makeExactICmpRegion(Pred, *C); if (RHS_CR.isEmptySet()) return ConstantInt::getFalse(ITy); if (RHS_CR.isFullSet()) return ConstantInt::getTrue(ITy); ConstantRange LHS_CR = computeConstantRange(LHS, IIQ.UseInstrInfo); if (!LHS_CR.isFullSet()) { if (RHS_CR.contains(LHS_CR)) return ConstantInt::getTrue(ITy); if (RHS_CR.inverse().contains(LHS_CR)) return ConstantInt::getFalse(ITy); } return nullptr; } /// TODO: A large part of this logic is duplicated in InstCombine's /// foldICmpBinOp(). We should be able to share that and avoid the code /// duplication. static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = GetCompareTy(LHS); // The return type. BinaryOperator *LBO = dyn_cast(LHS); BinaryOperator *RBO = dyn_cast(RHS); if (MaxRecurse && (LBO || RBO)) { // Analyze the case when either LHS or RHS is an add instruction. Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; // LHS = A + B (or A and B are null); RHS = C + D (or C and D are null). bool NoLHSWrapProblem = false, NoRHSWrapProblem = false; if (LBO && LBO->getOpcode() == Instruction::Add) { A = LBO->getOperand(0); B = LBO->getOperand(1); NoLHSWrapProblem = ICmpInst::isEquality(Pred) || (CmpInst::isUnsigned(Pred) && Q.IIQ.hasNoUnsignedWrap(cast(LBO))) || (CmpInst::isSigned(Pred) && Q.IIQ.hasNoSignedWrap(cast(LBO))); } if (RBO && RBO->getOpcode() == Instruction::Add) { C = RBO->getOperand(0); D = RBO->getOperand(1); NoRHSWrapProblem = ICmpInst::isEquality(Pred) || (CmpInst::isUnsigned(Pred) && Q.IIQ.hasNoUnsignedWrap(cast(RBO))) || (CmpInst::isSigned(Pred) && Q.IIQ.hasNoSignedWrap(cast(RBO))); } // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow. if ((A == RHS || B == RHS) && NoLHSWrapProblem) if (Value *V = SimplifyICmpInst(Pred, A == RHS ? B : A, Constant::getNullValue(RHS->getType()), Q, MaxRecurse - 1)) return V; // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow. if ((C == LHS || D == LHS) && NoRHSWrapProblem) if (Value *V = SimplifyICmpInst(Pred, Constant::getNullValue(LHS->getType()), C == LHS ? D : C, Q, MaxRecurse - 1)) return V; // icmp (X+Y), (X+Z) -> icmp Y,Z for equalities or if there is no overflow. if (A && C && (A == C || A == D || B == C || B == D) && NoLHSWrapProblem && NoRHSWrapProblem) { // Determine Y and Z in the form icmp (X+Y), (X+Z). Value *Y, *Z; if (A == C) { // C + B == C + D -> B == D Y = B; Z = D; } else if (A == D) { // D + B == C + D -> B == C Y = B; Z = C; } else if (B == C) { // A + C == C + D -> A == D Y = A; Z = D; } else { assert(B == D); // A + D == C + D -> A == C Y = A; Z = C; } if (Value *V = SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse - 1)) return V; } } { Value *Y = nullptr; // icmp pred (or X, Y), X if (LBO && match(LBO, m_c_Or(m_Value(Y), m_Specific(RHS)))) { if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) { KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (RHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy); if (RHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SLT ? getFalse(ITy) : getTrue(ITy); } } // icmp pred X, (or X, Y) if (RBO && match(RBO, m_c_Or(m_Value(Y), m_Specific(LHS)))) { if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) { KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy); if (LHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy); } } } // icmp pred (and X, Y), X if (LBO && match(LBO, m_c_And(m_Value(), m_Specific(RHS)))) { if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); } // icmp pred X, (and X, Y) if (RBO && match(RBO, m_c_And(m_Value(), m_Specific(LHS)))) { if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); } // 0 - (zext X) pred C if (!CmpInst::isUnsigned(Pred) && match(LHS, m_Neg(m_ZExt(m_Value())))) { if (ConstantInt *RHSC = dyn_cast(RHS)) { if (RHSC->getValue().isStrictlyPositive()) { if (Pred == ICmpInst::ICMP_SLT) return ConstantInt::getTrue(RHSC->getContext()); if (Pred == ICmpInst::ICMP_SGE) return ConstantInt::getFalse(RHSC->getContext()); if (Pred == ICmpInst::ICMP_EQ) return ConstantInt::getFalse(RHSC->getContext()); if (Pred == ICmpInst::ICMP_NE) return ConstantInt::getTrue(RHSC->getContext()); } if (RHSC->getValue().isNonNegative()) { if (Pred == ICmpInst::ICMP_SLE) return ConstantInt::getTrue(RHSC->getContext()); if (Pred == ICmpInst::ICMP_SGT) return ConstantInt::getFalse(RHSC->getContext()); } } } // icmp pred (urem X, Y), Y if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) { switch (Pred) { default: break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: { KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getFalse(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: { KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return getTrue(ITy); } } // icmp pred X, (urem Y, X) if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) { switch (Pred) { default: break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: { KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getTrue(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: { KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return getFalse(ITy); } } // x >> y <=u x // x udiv y <=u x. if (LBO && (match(LBO, m_LShr(m_Specific(RHS), m_Value())) || match(LBO, m_UDiv(m_Specific(RHS), m_Value())))) { // icmp pred (X op Y), X if (Pred == ICmpInst::ICMP_UGT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_ULE) return getTrue(ITy); } // x >=u x >> y // x >=u x udiv y. if (RBO && (match(RBO, m_LShr(m_Specific(LHS), m_Value())) || match(RBO, m_UDiv(m_Specific(LHS), m_Value())))) { // icmp pred X, (X op Y) if (Pred == ICmpInst::ICMP_ULT) return getFalse(ITy); if (Pred == ICmpInst::ICMP_UGE) return getTrue(ITy); } // handle: // CI2 << X == CI // CI2 << X != CI // // where CI2 is a power of 2 and CI isn't if (auto *CI = dyn_cast(RHS)) { const APInt *CI2Val, *CIVal = &CI->getValue(); if (LBO && match(LBO, m_Shl(m_APInt(CI2Val), m_Value())) && CI2Val->isPowerOf2()) { if (!CIVal->isPowerOf2()) { // CI2 << X can equal zero in some circumstances, // this simplification is unsafe if CI is zero. // // We know it is safe if: // - The shift is nsw, we can't shift out the one bit. // - The shift is nuw, we can't shift out the one bit. // - CI2 is one // - CI isn't zero if (Q.IIQ.hasNoSignedWrap(cast(LBO)) || Q.IIQ.hasNoUnsignedWrap(cast(LBO)) || CI2Val->isOneValue() || !CI->isZero()) { if (Pred == ICmpInst::ICMP_EQ) return ConstantInt::getFalse(RHS->getContext()); if (Pred == ICmpInst::ICMP_NE) return ConstantInt::getTrue(RHS->getContext()); } } if (CIVal->isSignMask() && CI2Val->isOneValue()) { if (Pred == ICmpInst::ICMP_UGT) return ConstantInt::getFalse(RHS->getContext()); if (Pred == ICmpInst::ICMP_ULE) return ConstantInt::getTrue(RHS->getContext()); } } } if (MaxRecurse && LBO && RBO && LBO->getOpcode() == RBO->getOpcode() && LBO->getOperand(1) == RBO->getOperand(1)) { switch (LBO->getOpcode()) { default: break; case Instruction::UDiv: case Instruction::LShr: if (ICmpInst::isSigned(Pred) || !Q.IIQ.isExact(LBO) || !Q.IIQ.isExact(RBO)) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::SDiv: if (!ICmpInst::isEquality(Pred) || !Q.IIQ.isExact(LBO) || !Q.IIQ.isExact(RBO)) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::AShr: if (!Q.IIQ.isExact(LBO) || !Q.IIQ.isExact(RBO)) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; case Instruction::Shl: { bool NUW = Q.IIQ.hasNoUnsignedWrap(LBO) && Q.IIQ.hasNoUnsignedWrap(RBO); bool NSW = Q.IIQ.hasNoSignedWrap(LBO) && Q.IIQ.hasNoSignedWrap(RBO); if (!NUW && !NSW) break; if (!NSW && ICmpInst::isSigned(Pred)) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), RBO->getOperand(0), Q, MaxRecurse - 1)) return V; break; } } } return nullptr; } /// Simplify integer comparisons where at least one operand of the compare /// matches an integer min/max idiom. static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = GetCompareTy(LHS); // The return type. Value *A, *B; CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE; CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B". // Signed variants on "max(a,b)>=a -> true". if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // smax(A, B) pred A. EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B". // We analyze this as smax(A, B) pred A. P = Pred; } else if (match(RHS, m_SMax(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred smax(A, B). EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B". // We analyze this as smax(A, B) swapped-pred A. P = CmpInst::getSwappedPredicate(Pred); } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // smin(A, B) pred A. EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B". // We analyze this as smax(-A, -B) swapped-pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = CmpInst::getSwappedPredicate(Pred); } else if (match(RHS, m_SMin(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred smin(A, B). EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B". // We analyze this as smax(-A, -B) pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = Pred; } if (P != CmpInst::BAD_ICMP_PREDICATE) { // Cases correspond to "max(A, B) p A". switch (P) { default: break; case CmpInst::ICMP_EQ: case CmpInst::ICMP_SLE: // Equivalent to "A EqP B". This may be the same as the condition tested // in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) return V; break; case CmpInst::ICMP_NE: case CmpInst::ICMP_SGT: { CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); // Equivalent to "A InvEqP B". This may be the same as the condition // tested in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) return V; break; } case CmpInst::ICMP_SGE: // Always true. return getTrue(ITy); case CmpInst::ICMP_SLT: // Always false. return getFalse(ITy); } } // Unsigned variants on "max(a,b)>=a -> true". P = CmpInst::BAD_ICMP_PREDICATE; if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // umax(A, B) pred A. EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B". // We analyze this as umax(A, B) pred A. P = Pred; } else if (match(RHS, m_UMax(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred umax(A, B). EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B". // We analyze this as umax(A, B) swapped-pred A. P = CmpInst::getSwappedPredicate(Pred); } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) { if (A != RHS) std::swap(A, B); // umin(A, B) pred A. EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B". // We analyze this as umax(-A, -B) swapped-pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = CmpInst::getSwappedPredicate(Pred); } else if (match(RHS, m_UMin(m_Value(A), m_Value(B))) && (A == LHS || B == LHS)) { if (A != LHS) std::swap(A, B); // A pred umin(A, B). EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B". // We analyze this as umax(-A, -B) pred -A. // Note that we do not need to actually form -A or -B thanks to EqP. P = Pred; } if (P != CmpInst::BAD_ICMP_PREDICATE) { // Cases correspond to "max(A, B) p A". switch (P) { default: break; case CmpInst::ICMP_EQ: case CmpInst::ICMP_ULE: // Equivalent to "A EqP B". This may be the same as the condition tested // in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B)) return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse - 1)) return V; break; case CmpInst::ICMP_NE: case CmpInst::ICMP_UGT: { CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP); // Equivalent to "A InvEqP B". This may be the same as the condition // tested in the max/min; if so, we can just return that. if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B)) return V; if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B)) return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse - 1)) return V; break; } case CmpInst::ICMP_UGE: // Always true. return getTrue(ITy); case CmpInst::ICMP_ULT: // Always false. return getFalse(ITy); } } // Variants on "max(x,y) >= min(x,z)". Value *C, *D; if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && match(RHS, m_SMin(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // max(x, ?) pred min(x, ?). if (Pred == CmpInst::ICMP_SGE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_SLT) // Always false. return getFalse(ITy); } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) && match(RHS, m_SMax(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // min(x, ?) pred max(x, ?). if (Pred == CmpInst::ICMP_SLE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_SGT) // Always false. return getFalse(ITy); } else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && match(RHS, m_UMin(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // max(x, ?) pred min(x, ?). if (Pred == CmpInst::ICMP_UGE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_ULT) // Always false. return getFalse(ITy); } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) && match(RHS, m_UMax(m_Value(C), m_Value(D))) && (A == C || A == D || B == C || B == D)) { // min(x, ?) pred max(x, ?). if (Pred == CmpInst::ICMP_ULE) // Always true. return getTrue(ITy); if (Pred == CmpInst::ICMP_UGT) // Always false. return getFalse(ITy); } return nullptr; } /// Given operands for an ICmpInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!"); if (Constant *CLHS = dyn_cast(LHS)) { if (Constant *CRHS = dyn_cast(RHS)) return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } assert(!isa(LHS) && "Unexpected icmp undef,%X"); Type *ITy = GetCompareTy(LHS); // The return type. // For EQ and NE, we can always pick a value for the undef to make the // predicate pass or fail, so we can return undef. // Matches behavior in llvm::ConstantFoldCompareInstruction. if (isa(RHS) && ICmpInst::isEquality(Pred)) return UndefValue::get(ITy); // icmp X, X -> true/false // icmp X, undef -> true/false because undef could be X. if (LHS == RHS || isa(RHS)) return ConstantInt::get(ITy, CmpInst::isTrueWhenEqual(Pred)); if (Value *V = simplifyICmpOfBools(Pred, LHS, RHS, Q)) return V; if (Value *V = simplifyICmpWithZero(Pred, LHS, RHS, Q)) return V; if (Value *V = simplifyICmpWithConstant(Pred, LHS, RHS, Q.IIQ)) return V; // If both operands have range metadata, use the metadata // to simplify the comparison. if (isa(RHS) && isa(LHS)) { auto RHS_Instr = cast(RHS); auto LHS_Instr = cast(LHS); if (Q.IIQ.getMetadata(RHS_Instr, LLVMContext::MD_range) && Q.IIQ.getMetadata(LHS_Instr, LLVMContext::MD_range)) { auto RHS_CR = getConstantRangeFromMetadata( *RHS_Instr->getMetadata(LLVMContext::MD_range)); auto LHS_CR = getConstantRangeFromMetadata( *LHS_Instr->getMetadata(LLVMContext::MD_range)); auto Satisfied_CR = ConstantRange::makeSatisfyingICmpRegion(Pred, RHS_CR); if (Satisfied_CR.contains(LHS_CR)) return ConstantInt::getTrue(RHS->getContext()); auto InversedSatisfied_CR = ConstantRange::makeSatisfyingICmpRegion( CmpInst::getInversePredicate(Pred), RHS_CR); if (InversedSatisfied_CR.contains(LHS_CR)) return ConstantInt::getFalse(RHS->getContext()); } } // Compare of cast, for example (zext X) != 0 -> X != 0 if (isa(LHS) && (isa(RHS) || isa(RHS))) { Instruction *LI = cast(LHS); Value *SrcOp = LI->getOperand(0); Type *SrcTy = SrcOp->getType(); Type *DstTy = LI->getType(); // Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input // if the integer type is the same size as the pointer type. if (MaxRecurse && isa(LI) && Q.DL.getTypeSizeInBits(SrcTy) == DstTy->getPrimitiveSizeInBits()) { if (Constant *RHSC = dyn_cast(RHS)) { // Transfer the cast to the constant. if (Value *V = SimplifyICmpInst(Pred, SrcOp, ConstantExpr::getIntToPtr(RHSC, SrcTy), Q, MaxRecurse-1)) return V; } else if (PtrToIntInst *RI = dyn_cast(RHS)) { if (RI->getOperand(0)->getType() == SrcTy) // Compare without the cast. if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } } if (isa(LHS)) { // Turn icmp (zext X), (zext Y) into a compare of X and Y if they have the // same type. if (ZExtInst *RI = dyn_cast(RHS)) { if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that signed predicates become unsigned. if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } // Turn icmp (zext X), Cst into a compare of X and Cst if Cst is extended // too. If not, then try to deduce the result of the comparison. else if (ConstantInt *CI = dyn_cast(RHS)) { // Compute the constant that would happen if we truncated to SrcTy then // reextended to DstTy. Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy); Constant *RExt = ConstantExpr::getCast(CastInst::ZExt, Trunc, DstTy); // If the re-extended constant didn't change then this is effectively // also a case of comparing two zero-extended values. if (RExt == CI && MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), SrcOp, Trunc, Q, MaxRecurse-1)) return V; // Otherwise the upper bits of LHS are zero while RHS has a non-zero bit // there. Use this to work out the result of the comparison. if (RExt != CI) { switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); // LHS getContext()); case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return ConstantInt::getTrue(CI->getContext()); // LHS is non-negative. If RHS is negative then LHS >s LHS. If RHS // is non-negative then LHS getValue().isNegative() ? ConstantInt::getTrue(CI->getContext()) : ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: return CI->getValue().isNegative() ? ConstantInt::getFalse(CI->getContext()) : ConstantInt::getTrue(CI->getContext()); } } } } if (isa(LHS)) { // Turn icmp (sext X), (sext Y) into a compare of X and Y if they have the // same type. if (SExtInst *RI = dyn_cast(RHS)) { if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that the predicate does not change. if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } // Turn icmp (sext X), Cst into a compare of X and Cst if Cst is extended // too. If not, then try to deduce the result of the comparison. else if (ConstantInt *CI = dyn_cast(RHS)) { // Compute the constant that would happen if we truncated to SrcTy then // reextended to DstTy. Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy); Constant *RExt = ConstantExpr::getCast(CastInst::SExt, Trunc, DstTy); // If the re-extended constant didn't change then this is effectively // also a case of comparing two sign-extended values. if (RExt == CI && MaxRecurse) if (Value *V = SimplifyICmpInst(Pred, SrcOp, Trunc, Q, MaxRecurse-1)) return V; // Otherwise the upper bits of LHS are all equal, while RHS has varying // bits there. Use this to work out the result of the comparison. if (RExt != CI) { switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); case ICmpInst::ICMP_EQ: return ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_NE: return ConstantInt::getTrue(CI->getContext()); // If RHS is non-negative then LHS s RHS. case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: return CI->getValue().isNegative() ? ConstantInt::getTrue(CI->getContext()) : ConstantInt::getFalse(CI->getContext()); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: return CI->getValue().isNegative() ? ConstantInt::getFalse(CI->getContext()) : ConstantInt::getTrue(CI->getContext()); // If LHS is non-negative then LHS u RHS. case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: // Comparison is true iff the LHS =s 0. if (MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SGE, SrcOp, Constant::getNullValue(SrcTy), Q, MaxRecurse-1)) return V; break; } } } } } // icmp eq|ne X, Y -> false|true if X != Y if (ICmpInst::isEquality(Pred) && isKnownNonEqual(LHS, RHS, Q.DL, Q.AC, Q.CxtI, Q.DT, Q.IIQ.UseInstrInfo)) { return Pred == ICmpInst::ICMP_NE ? getTrue(ITy) : getFalse(ITy); } if (Value *V = simplifyICmpWithBinOp(Pred, LHS, RHS, Q, MaxRecurse)) return V; if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse)) return V; // Simplify comparisons of related pointers using a powerful, recursive // GEP-walk when we have target data available.. if (LHS->getType()->isPointerTy()) if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.AC, Q.CxtI, Q.IIQ, LHS, RHS)) return C; if (auto *CLHS = dyn_cast(LHS)) if (auto *CRHS = dyn_cast(RHS)) if (Q.DL.getTypeSizeInBits(CLHS->getPointerOperandType()) == Q.DL.getTypeSizeInBits(CLHS->getType()) && Q.DL.getTypeSizeInBits(CRHS->getPointerOperandType()) == Q.DL.getTypeSizeInBits(CRHS->getType())) if (auto *C = computePointerICmp(Q.DL, Q.TLI, Q.DT, Pred, Q.AC, Q.CxtI, Q.IIQ, CLHS->getPointerOperand(), CRHS->getPointerOperand())) return C; if (GetElementPtrInst *GLHS = dyn_cast(LHS)) { if (GEPOperator *GRHS = dyn_cast(RHS)) { if (GLHS->getPointerOperand() == GRHS->getPointerOperand() && GLHS->hasAllConstantIndices() && GRHS->hasAllConstantIndices() && (ICmpInst::isEquality(Pred) || (GLHS->isInBounds() && GRHS->isInBounds() && Pred == ICmpInst::getSignedPredicate(Pred)))) { // The bases are equal and the indices are constant. Build a constant // expression GEP with the same indices and a null base pointer to see // what constant folding can make out of it. Constant *Null = Constant::getNullValue(GLHS->getPointerOperandType()); SmallVector IndicesLHS(GLHS->idx_begin(), GLHS->idx_end()); Constant *NewLHS = ConstantExpr::getGetElementPtr( GLHS->getSourceElementType(), Null, IndicesLHS); SmallVector IndicesRHS(GRHS->idx_begin(), GRHS->idx_end()); Constant *NewRHS = ConstantExpr::getGetElementPtr( GLHS->getSourceElementType(), Null, IndicesRHS); return ConstantExpr::getICmp(Pred, NewLHS, NewRHS); } } } // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } /// Given operands for an FCmpInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!"); if (Constant *CLHS = dyn_cast(LHS)) { if (Constant *CRHS = dyn_cast(RHS)) return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.DL, Q.TLI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); Pred = CmpInst::getSwappedPredicate(Pred); } // Fold trivial predicates. Type *RetTy = GetCompareTy(LHS); if (Pred == FCmpInst::FCMP_FALSE) return getFalse(RetTy); if (Pred == FCmpInst::FCMP_TRUE) return getTrue(RetTy); // Fold (un)ordered comparison if we can determine there are no NaNs. if (Pred == FCmpInst::FCMP_UNO || Pred == FCmpInst::FCMP_ORD) if (FMF.noNaNs() || (isKnownNeverNaN(LHS, Q.TLI) && isKnownNeverNaN(RHS, Q.TLI))) return ConstantInt::get(RetTy, Pred == FCmpInst::FCMP_ORD); // NaN is unordered; NaN is not ordered. assert((FCmpInst::isOrdered(Pred) || FCmpInst::isUnordered(Pred)) && "Comparison must be either ordered or unordered"); if (match(RHS, m_NaN())) return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred)); // fcmp pred x, undef and fcmp pred undef, x // fold to true if unordered, false if ordered if (isa(LHS) || isa(RHS)) { // Choosing NaN for the undef will always make unordered comparison succeed // and ordered comparison fail. return ConstantInt::get(RetTy, CmpInst::isUnordered(Pred)); } // fcmp x,x -> true/false. Not all compares are foldable. if (LHS == RHS) { if (CmpInst::isTrueWhenEqual(Pred)) return getTrue(RetTy); if (CmpInst::isFalseWhenEqual(Pred)) return getFalse(RetTy); } // Handle fcmp with constant RHS. // TODO: Use match with a specific FP value, so these work with vectors with // undef lanes. const APFloat *C; if (match(RHS, m_APFloat(C))) { // Check whether the constant is an infinity. if (C->isInfinity()) { if (C->isNegative()) { switch (Pred) { case FCmpInst::FCMP_OLT: // No value is ordered and less than negative infinity. return getFalse(RetTy); case FCmpInst::FCMP_UGE: // All values are unordered with or at least negative infinity. return getTrue(RetTy); default: break; } } else { switch (Pred) { case FCmpInst::FCMP_OGT: // No value is ordered and greater than infinity. return getFalse(RetTy); case FCmpInst::FCMP_ULE: // All values are unordered with and at most infinity. return getTrue(RetTy); default: break; } } } if (C->isNegative() && !C->isNegZero()) { assert(!C->isNaN() && "Unexpected NaN constant!"); // TODO: We can catch more cases by using a range check rather than // relying on CannotBeOrderedLessThanZero. switch (Pred) { case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_UGT: case FCmpInst::FCMP_UNE: // (X >= 0) implies (X > C) when (C < 0) if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getTrue(RetTy); break; case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_OLE: case FCmpInst::FCMP_OLT: // (X >= 0) implies !(X < C) when (C < 0) if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return getFalse(RetTy); break; default: break; } } // Check comparison of [minnum/maxnum with constant] with other constant. const APFloat *C2; if ((match(LHS, m_Intrinsic(m_Value(), m_APFloat(C2))) && C2->compare(*C) == APFloat::cmpLessThan) || (match(LHS, m_Intrinsic(m_Value(), m_APFloat(C2))) && C2->compare(*C) == APFloat::cmpGreaterThan)) { bool IsMaxNum = cast(LHS)->getIntrinsicID() == Intrinsic::maxnum; // The ordered relationship and minnum/maxnum guarantee that we do not // have NaN constants, so ordered/unordered preds are handled the same. switch (Pred) { case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_UEQ: // minnum(X, LesserC) == C --> false // maxnum(X, GreaterC) == C --> false return getFalse(RetTy); case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_UNE: // minnum(X, LesserC) != C --> true // maxnum(X, GreaterC) != C --> true return getTrue(RetTy); case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_OGT: case FCmpInst::FCMP_UGT: // minnum(X, LesserC) >= C --> false // minnum(X, LesserC) > C --> false // maxnum(X, GreaterC) >= C --> true // maxnum(X, GreaterC) > C --> true return ConstantInt::get(RetTy, IsMaxNum); case FCmpInst::FCMP_OLE: case FCmpInst::FCMP_ULE: case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_ULT: // minnum(X, LesserC) <= C --> true // minnum(X, LesserC) < C --> true // maxnum(X, GreaterC) <= C --> false // maxnum(X, GreaterC) < C --> false return ConstantInt::get(RetTy, !IsMaxNum); default: // TRUE/FALSE/ORD/UNO should be handled before this. llvm_unreachable("Unexpected fcmp predicate"); } } } if (match(RHS, m_AnyZeroFP())) { switch (Pred) { case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_ULT: // Positive or zero X >= 0.0 --> true // Positive or zero X < 0.0 --> false if ((FMF.noNaNs() || isKnownNeverNaN(LHS, Q.TLI)) && CannotBeOrderedLessThanZero(LHS, Q.TLI)) return Pred == FCmpInst::FCMP_OGE ? getTrue(RetTy) : getFalse(RetTy); break; case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_OLT: // Positive or zero or nan X >= 0.0 --> true // Positive or zero or nan X < 0.0 --> false if (CannotBeOrderedLessThanZero(LHS, Q.TLI)) return Pred == FCmpInst::FCMP_UGE ? getTrue(RetTy) : getFalse(RetTy); break; default: break; } } // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa(LHS) || isa(RHS)) if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return nullptr; } Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } /// See if V simplifies when its operand Op is replaced with RepOp. static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const SimplifyQuery &Q, unsigned MaxRecurse) { // Trivial replacement. if (V == Op) return RepOp; // We cannot replace a constant, and shouldn't even try. if (isa(Op)) return nullptr; auto *I = dyn_cast(V); if (!I) return nullptr; // If this is a binary operator, try to simplify it with the replaced op. if (auto *B = dyn_cast(I)) { // Consider: // %cmp = icmp eq i32 %x, 2147483647 // %add = add nsw i32 %x, 1 // %sel = select i1 %cmp, i32 -2147483648, i32 %add // // We can't replace %sel with %add unless we strip away the flags. if (isa(B)) if (Q.IIQ.hasNoSignedWrap(B) || Q.IIQ.hasNoUnsignedWrap(B)) return nullptr; if (isa(B) && Q.IIQ.isExact(B)) return nullptr; if (MaxRecurse) { if (B->getOperand(0) == Op) return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), Q, MaxRecurse - 1); if (B->getOperand(1) == Op) return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, Q, MaxRecurse - 1); } } // Same for CmpInsts. if (CmpInst *C = dyn_cast(I)) { if (MaxRecurse) { if (C->getOperand(0) == Op) return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), Q, MaxRecurse - 1); if (C->getOperand(1) == Op) return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, Q, MaxRecurse - 1); } } // Same for GEPs. if (auto *GEP = dyn_cast(I)) { if (MaxRecurse) { SmallVector NewOps(GEP->getNumOperands()); transform(GEP->operands(), NewOps.begin(), [&](Value *V) { return V == Op ? RepOp : V; }); return SimplifyGEPInst(GEP->getSourceElementType(), NewOps, Q, MaxRecurse - 1); } } // TODO: We could hand off more cases to instsimplify here. // If all operands are constant after substituting Op for RepOp then we can // constant fold the instruction. if (Constant *CRepOp = dyn_cast(RepOp)) { // Build a list of all constant operands. SmallVector ConstOps; for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { if (I->getOperand(i) == Op) ConstOps.push_back(CRepOp); else if (Constant *COp = dyn_cast(I->getOperand(i))) ConstOps.push_back(COp); else break; } // All operands were constants, fold it. if (ConstOps.size() == I->getNumOperands()) { if (CmpInst *C = dyn_cast(I)) return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0], ConstOps[1], Q.DL, Q.TLI); if (LoadInst *LI = dyn_cast(I)) if (!LI->isVolatile()) return ConstantFoldLoadFromConstPtr(ConstOps[0], LI->getType(), Q.DL); return ConstantFoldInstOperands(I, ConstOps, Q.DL, Q.TLI); } } return nullptr; } /// Try to simplify a select instruction when its condition operand is an /// integer comparison where one operand of the compare is a constant. static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X, const APInt *Y, bool TrueWhenUnset) { const APInt *C; // (X & Y) == 0 ? X & ~Y : X --> X // (X & Y) != 0 ? X & ~Y : X --> X & ~Y if (FalseVal == X && match(TrueVal, m_And(m_Specific(X), m_APInt(C))) && *Y == ~*C) return TrueWhenUnset ? FalseVal : TrueVal; // (X & Y) == 0 ? X : X & ~Y --> X & ~Y // (X & Y) != 0 ? X : X & ~Y --> X if (TrueVal == X && match(FalseVal, m_And(m_Specific(X), m_APInt(C))) && *Y == ~*C) return TrueWhenUnset ? FalseVal : TrueVal; if (Y->isPowerOf2()) { // (X & Y) == 0 ? X | Y : X --> X | Y // (X & Y) != 0 ? X | Y : X --> X if (FalseVal == X && match(TrueVal, m_Or(m_Specific(X), m_APInt(C))) && *Y == *C) return TrueWhenUnset ? TrueVal : FalseVal; // (X & Y) == 0 ? X : X | Y --> X // (X & Y) != 0 ? X : X | Y --> X | Y if (TrueVal == X && match(FalseVal, m_Or(m_Specific(X), m_APInt(C))) && *Y == *C) return TrueWhenUnset ? TrueVal : FalseVal; } return nullptr; } /// An alternative way to test if a bit is set or not uses sgt/slt instead of /// eq/ne. static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS, ICmpInst::Predicate Pred, Value *TrueVal, Value *FalseVal) { Value *X; APInt Mask; if (!decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, X, Mask)) return nullptr; return simplifySelectBitTest(TrueVal, FalseVal, X, &Mask, Pred == ICmpInst::ICMP_EQ); } /// Try to simplify a select instruction when its condition operand is an /// integer comparison. static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { ICmpInst::Predicate Pred; Value *CmpLHS, *CmpRHS; if (!match(CondVal, m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)))) return nullptr; if (ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero())) { Value *X; const APInt *Y; if (match(CmpLHS, m_And(m_Value(X), m_APInt(Y)))) if (Value *V = simplifySelectBitTest(TrueVal, FalseVal, X, Y, Pred == ICmpInst::ICMP_EQ)) return V; // Test for a bogus zero-shift-guard-op around funnel-shift or rotate. Value *ShAmt; auto isFsh = m_CombineOr(m_Intrinsic(m_Value(X), m_Value(), m_Value(ShAmt)), m_Intrinsic(m_Value(), m_Value(X), m_Value(ShAmt))); // (ShAmt == 0) ? fshl(X, *, ShAmt) : X --> X // (ShAmt == 0) ? fshr(*, X, ShAmt) : X --> X if (match(TrueVal, isFsh) && FalseVal == X && CmpLHS == ShAmt && Pred == ICmpInst::ICMP_EQ) return X; // (ShAmt != 0) ? X : fshl(X, *, ShAmt) --> X // (ShAmt != 0) ? X : fshr(*, X, ShAmt) --> X if (match(FalseVal, isFsh) && TrueVal == X && CmpLHS == ShAmt && Pred == ICmpInst::ICMP_NE) return X; // Test for a zero-shift-guard-op around rotates. These are used to // avoid UB from oversized shifts in raw IR rotate patterns, but the // intrinsics do not have that problem. // We do not allow this transform for the general funnel shift case because // that would not preserve the poison safety of the original code. auto isRotate = m_CombineOr(m_Intrinsic(m_Value(X), m_Deferred(X), m_Value(ShAmt)), m_Intrinsic(m_Value(X), m_Deferred(X), m_Value(ShAmt))); // (ShAmt != 0) ? fshl(X, X, ShAmt) : X --> fshl(X, X, ShAmt) // (ShAmt != 0) ? fshr(X, X, ShAmt) : X --> fshr(X, X, ShAmt) if (match(TrueVal, isRotate) && FalseVal == X && CmpLHS == ShAmt && Pred == ICmpInst::ICMP_NE) return TrueVal; // (ShAmt == 0) ? X : fshl(X, X, ShAmt) --> fshl(X, X, ShAmt) // (ShAmt == 0) ? X : fshr(X, X, ShAmt) --> fshr(X, X, ShAmt) if (match(FalseVal, isRotate) && TrueVal == X && CmpLHS == ShAmt && Pred == ICmpInst::ICMP_EQ) return FalseVal; } // Check for other compares that behave like bit test. if (Value *V = simplifySelectWithFakeICmpEq(CmpLHS, CmpRHS, Pred, TrueVal, FalseVal)) return V; // If we have an equality comparison, then we know the value in one of the // arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. if (Pred == ICmpInst::ICMP_EQ) { if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == TrueVal || SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == TrueVal) return FalseVal; if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == FalseVal || SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == FalseVal) return FalseVal; } else if (Pred == ICmpInst::ICMP_NE) { if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == FalseVal || SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == FalseVal) return TrueVal; if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == TrueVal || SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == TrueVal) return TrueVal; } return nullptr; } /// Try to simplify a select instruction when its condition operand is a /// floating-point comparison. static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F) { FCmpInst::Predicate Pred; if (!match(Cond, m_FCmp(Pred, m_Specific(T), m_Specific(F))) && !match(Cond, m_FCmp(Pred, m_Specific(F), m_Specific(T)))) return nullptr; // TODO: The transform may not be valid with -0.0. An incomplete way of // testing for that possibility is to check if at least one operand is a // non-zero constant. const APFloat *C; if ((match(T, m_APFloat(C)) && C->isNonZero()) || (match(F, m_APFloat(C)) && C->isNonZero())) { // (T == F) ? T : F --> F // (F == T) ? T : F --> F if (Pred == FCmpInst::FCMP_OEQ) return F; // (T != F) ? T : F --> T // (F != T) ? T : F --> T if (Pred == FCmpInst::FCMP_UNE) return T; } return nullptr; } /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *CondC = dyn_cast(Cond)) { if (auto *TrueC = dyn_cast(TrueVal)) if (auto *FalseC = dyn_cast(FalseVal)) return ConstantFoldSelectInstruction(CondC, TrueC, FalseC); // select undef, X, Y -> X or Y if (isa(CondC)) return isa(FalseVal) ? FalseVal : TrueVal; // TODO: Vector constants with undef elements don't simplify. // select true, X, Y -> X if (CondC->isAllOnesValue()) return TrueVal; // select false, X, Y -> Y if (CondC->isNullValue()) return FalseVal; } // select ?, X, X -> X if (TrueVal == FalseVal) return TrueVal; if (isa(TrueVal)) // select ?, undef, X -> X return FalseVal; if (isa(FalseVal)) // select ?, X, undef -> X return TrueVal; if (Value *V = simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse)) return V; if (Value *V = simplifySelectWithFCmp(Cond, TrueVal, FalseVal)) return V; if (Value *V = foldSelectWithBinaryOp(Cond, TrueVal, FalseVal)) return V; Optional Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL); if (Imp) return *Imp ? TrueVal : FalseVal; return nullptr; } Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q) { return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Q, RecursionLimit); } /// Given operands for an GetElementPtrInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const SimplifyQuery &Q, unsigned) { // The type of the GEP pointer operand. unsigned AS = cast(Ops[0]->getType()->getScalarType())->getAddressSpace(); // getelementptr P -> P. if (Ops.size() == 1) return Ops[0]; // Compute the (pointer) type returned by the GEP instruction. Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Ops.slice(1)); Type *GEPTy = PointerType::get(LastType, AS); if (VectorType *VT = dyn_cast(Ops[0]->getType())) GEPTy = VectorType::get(GEPTy, VT->getNumElements()); else if (VectorType *VT = dyn_cast(Ops[1]->getType())) GEPTy = VectorType::get(GEPTy, VT->getNumElements()); if (isa(Ops[0])) return UndefValue::get(GEPTy); if (Ops.size() == 2) { // getelementptr P, 0 -> P. if (match(Ops[1], m_Zero()) && Ops[0]->getType() == GEPTy) return Ops[0]; Type *Ty = SrcTy; if (Ty->isSized()) { Value *P; uint64_t C; uint64_t TyAllocSize = Q.DL.getTypeAllocSize(Ty); // getelementptr P, N -> P if P points to a type of zero size. if (TyAllocSize == 0 && Ops[0]->getType() == GEPTy) return Ops[0]; // The following transforms are only safe if the ptrtoint cast // doesn't truncate the pointers. if (Ops[1]->getType()->getScalarSizeInBits() == Q.DL.getIndexSizeInBits(AS)) { auto PtrToIntOrZero = [GEPTy](Value *P) -> Value * { if (match(P, m_Zero())) return Constant::getNullValue(GEPTy); Value *Temp; if (match(P, m_PtrToInt(m_Value(Temp)))) if (Temp->getType() == GEPTy) return Temp; return nullptr; }; // getelementptr V, (sub P, V) -> P if P points to a type of size 1. if (TyAllocSize == 1 && match(Ops[1], m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))))) if (Value *R = PtrToIntOrZero(P)) return R; // getelementptr V, (ashr (sub P, V), C) -> Q // if P points to a type of size 1 << C. if (match(Ops[1], m_AShr(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))), m_ConstantInt(C))) && TyAllocSize == 1ULL << C) if (Value *R = PtrToIntOrZero(P)) return R; // getelementptr V, (sdiv (sub P, V), C) -> Q // if P points to a type of size C. if (match(Ops[1], m_SDiv(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))), m_SpecificInt(TyAllocSize)))) if (Value *R = PtrToIntOrZero(P)) return R; } } } if (Q.DL.getTypeAllocSize(LastType) == 1 && all_of(Ops.slice(1).drop_back(1), [](Value *Idx) { return match(Idx, m_Zero()); })) { unsigned IdxWidth = Q.DL.getIndexSizeInBits(Ops[0]->getType()->getPointerAddressSpace()); if (Q.DL.getTypeSizeInBits(Ops.back()->getType()) == IdxWidth) { APInt BasePtrOffset(IdxWidth, 0); Value *StrippedBasePtr = Ops[0]->stripAndAccumulateInBoundsConstantOffsets(Q.DL, BasePtrOffset); // gep (gep V, C), (sub 0, V) -> C if (match(Ops.back(), m_Sub(m_Zero(), m_PtrToInt(m_Specific(StrippedBasePtr))))) { auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset); return ConstantExpr::getIntToPtr(CI, GEPTy); } // gep (gep V, C), (xor V, -1) -> C-1 if (match(Ops.back(), m_Xor(m_PtrToInt(m_Specific(StrippedBasePtr)), m_AllOnes()))) { auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset - 1); return ConstantExpr::getIntToPtr(CI, GEPTy); } } } // Check to see if this is constant foldable. if (!all_of(Ops, [](Value *V) { return isa(V); })) return nullptr; auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast(Ops[0]), Ops.slice(1)); if (auto *CEFolded = ConstantFoldConstant(CE, Q.DL)) return CEFolded; return CE; } Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const SimplifyQuery &Q) { return ::SimplifyGEPInst(SrcTy, Ops, Q, RecursionLimit); } /// Given operands for an InsertValueInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const SimplifyQuery &Q, unsigned) { if (Constant *CAgg = dyn_cast(Agg)) if (Constant *CVal = dyn_cast(Val)) return ConstantFoldInsertValueInstruction(CAgg, CVal, Idxs); // insertvalue x, undef, n -> x if (match(Val, m_Undef())) return Agg; // insertvalue x, (extractvalue y, n), n if (ExtractValueInst *EV = dyn_cast(Val)) if (EV->getAggregateOperand()->getType() == Agg->getType() && EV->getIndices() == Idxs) { // insertvalue undef, (extractvalue y, n), n -> y if (match(Agg, m_Undef())) return EV->getAggregateOperand(); // insertvalue y, (extractvalue y, n), n -> y if (Agg == EV->getAggregateOperand()) return Agg; } return nullptr; } Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const SimplifyQuery &Q) { return ::SimplifyInsertValueInst(Agg, Val, Idxs, Q, RecursionLimit); } Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx, const SimplifyQuery &Q) { // Try to constant fold. auto *VecC = dyn_cast(Vec); auto *ValC = dyn_cast(Val); auto *IdxC = dyn_cast(Idx); if (VecC && ValC && IdxC) return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC); // Fold into undef if index is out of bounds. if (auto *CI = dyn_cast(Idx)) { uint64_t NumElements = cast(Vec->getType())->getNumElements(); if (CI->uge(NumElements)) return UndefValue::get(Vec->getType()); } // If index is undef, it might be out of bounds (see above case) if (isa(Idx)) return UndefValue::get(Vec->getType()); // Inserting an undef scalar? Assume it is the same value as the existing // vector element. if (isa(Val)) return Vec; // If we are extracting a value from a vector, then inserting it into the same // place, that's the input vector: // insertelt Vec, (extractelt Vec, Idx), Idx --> Vec if (match(Val, m_ExtractElement(m_Specific(Vec), m_Specific(Idx)))) return Vec; return nullptr; } /// Given operands for an ExtractValueInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &, unsigned) { if (auto *CAgg = dyn_cast(Agg)) return ConstantFoldExtractValueInstruction(CAgg, Idxs); // extractvalue x, (insertvalue y, elt, n), n -> elt unsigned NumIdxs = Idxs.size(); for (auto *IVI = dyn_cast(Agg); IVI != nullptr; IVI = dyn_cast(IVI->getAggregateOperand())) { ArrayRef InsertValueIdxs = IVI->getIndices(); unsigned NumInsertValueIdxs = InsertValueIdxs.size(); unsigned NumCommonIdxs = std::min(NumInsertValueIdxs, NumIdxs); if (InsertValueIdxs.slice(0, NumCommonIdxs) == Idxs.slice(0, NumCommonIdxs)) { if (NumIdxs == NumInsertValueIdxs) return IVI->getInsertedValueOperand(); break; } } return nullptr; } Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const SimplifyQuery &Q) { return ::SimplifyExtractValueInst(Agg, Idxs, Q, RecursionLimit); } /// Given operands for an ExtractElementInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &, unsigned) { if (auto *CVec = dyn_cast(Vec)) { if (auto *CIdx = dyn_cast(Idx)) return ConstantFoldExtractElementInstruction(CVec, CIdx); // The index is not relevant if our vector is a splat. if (auto *Splat = CVec->getSplatValue()) return Splat; if (isa(Vec)) return UndefValue::get(Vec->getType()->getVectorElementType()); } // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. if (auto *IdxC = dyn_cast(Idx)) { if (IdxC->getValue().uge(Vec->getType()->getVectorNumElements())) // definitely out of bounds, thus undefined result return UndefValue::get(Vec->getType()->getVectorElementType()); if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue())) return Elt; } // An undef extract index can be arbitrarily chosen to be an out-of-range // index value, which would result in the instruction being undef. if (isa(Idx)) return UndefValue::get(Vec->getType()->getVectorElementType()); return nullptr; } Value *llvm::SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q) { return ::SimplifyExtractElementInst(Vec, Idx, Q, RecursionLimit); } /// See if we can fold the given phi. If not, returns null. static Value *SimplifyPHINode(PHINode *PN, const SimplifyQuery &Q) { // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. Value *CommonValue = nullptr; bool HasUndefInput = false; for (Value *Incoming : PN->incoming_values()) { // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PN) continue; if (isa(Incoming)) { // Remember that we saw an undef value, but otherwise ignore them. HasUndefInput = true; continue; } if (CommonValue && Incoming != CommonValue) return nullptr; // Not the same, bail out. CommonValue = Incoming; } // If CommonValue is null then all of the incoming values were either undef or // equal to the phi node itself. if (!CommonValue) return UndefValue::get(PN->getType()); // If we have a PHI node like phi(X, undef, X), where X is defined by some // instruction, we cannot return X as the result of the PHI node unless it // dominates the PHI block. if (HasUndefInput) return valueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : nullptr; return CommonValue; } static Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *C = dyn_cast(Op)) return ConstantFoldCastOperand(CastOpc, C, Ty, Q.DL); if (auto *CI = dyn_cast(Op)) { auto *Src = CI->getOperand(0); Type *SrcTy = Src->getType(); Type *MidTy = CI->getType(); Type *DstTy = Ty; if (Src->getType() == Ty) { auto FirstOp = static_cast(CI->getOpcode()); auto SecondOp = static_cast(CastOpc); Type *SrcIntPtrTy = SrcTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(SrcTy) : nullptr; Type *MidIntPtrTy = MidTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(MidTy) : nullptr; Type *DstIntPtrTy = DstTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(DstTy) : nullptr; if (CastInst::isEliminableCastPair(FirstOp, SecondOp, SrcTy, MidTy, DstTy, SrcIntPtrTy, MidIntPtrTy, DstIntPtrTy) == Instruction::BitCast) return Src; } } // bitcast x -> x if (CastOpc == Instruction::BitCast) if (Op->getType() == Ty) return Op; return nullptr; } Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const SimplifyQuery &Q) { return ::SimplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit); } /// For the given destination element of a shuffle, peek through shuffles to /// match a root vector source operand that contains that element in the same /// vector lane (ie, the same mask index), so we can eliminate the shuffle(s). static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, int MaskVal, Value *RootVec, unsigned MaxRecurse) { if (!MaxRecurse--) return nullptr; // Bail out if any mask value is undefined. That kind of shuffle may be // simplified further based on demanded bits or other folds. if (MaskVal == -1) return nullptr; // The mask value chooses which source operand we need to look at next. int InVecNumElts = Op0->getType()->getVectorNumElements(); int RootElt = MaskVal; Value *SourceOp = Op0; if (MaskVal >= InVecNumElts) { RootElt = MaskVal - InVecNumElts; SourceOp = Op1; } // If the source operand is a shuffle itself, look through it to find the // matching root vector. if (auto *SourceShuf = dyn_cast(SourceOp)) { return foldIdentityShuffles( DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1), SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse); } // TODO: Look through bitcasts? What if the bitcast changes the vector element // size? // The source operand is not a shuffle. Initialize the root vector value for // this shuffle if that has not been done yet. if (!RootVec) RootVec = SourceOp; // Give up as soon as a source operand does not match the existing root value. if (RootVec != SourceOp) return nullptr; // The element must be coming from the same lane in the source vector // (although it may have crossed lanes in intermediate shuffles). if (RootElt != DestElt) return nullptr; return RootVec; } static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const SimplifyQuery &Q, unsigned MaxRecurse) { if (isa(Mask)) return UndefValue::get(RetTy); Type *InVecTy = Op0->getType(); unsigned MaskNumElts = Mask->getType()->getVectorNumElements(); unsigned InVecNumElts = InVecTy->getVectorNumElements(); SmallVector Indices; ShuffleVectorInst::getShuffleMask(Mask, Indices); assert(MaskNumElts == Indices.size() && "Size of Indices not same as number of mask elements?"); // Canonicalization: If mask does not select elements from an input vector, // replace that input vector with undef. bool MaskSelects0 = false, MaskSelects1 = false; for (unsigned i = 0; i != MaskNumElts; ++i) { if (Indices[i] == -1) continue; if ((unsigned)Indices[i] < InVecNumElts) MaskSelects0 = true; else MaskSelects1 = true; } if (!MaskSelects0) Op0 = UndefValue::get(InVecTy); if (!MaskSelects1) Op1 = UndefValue::get(InVecTy); auto *Op0Const = dyn_cast(Op0); auto *Op1Const = dyn_cast(Op1); // If all operands are constant, constant fold the shuffle. if (Op0Const && Op1Const) return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask); // Canonicalization: if only one input vector is constant, it shall be the // second one. if (Op0Const && !Op1Const) { std::swap(Op0, Op1); ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts); } // A shuffle of a splat is always the splat itself. Legal if the shuffle's // value type is same as the input vectors' type. if (auto *OpShuf = dyn_cast(Op0)) if (isa(Op1) && RetTy == InVecTy && OpShuf->getMask()->getSplatValue()) return Op0; // Don't fold a shuffle with undef mask elements. This may get folded in a // better way using demanded bits or other analysis. // TODO: Should we allow this? if (find(Indices, -1) != Indices.end()) return nullptr; // Check if every element of this shuffle can be mapped back to the // corresponding element of a single root vector. If so, we don't need this // shuffle. This handles simple identity shuffles as well as chains of // shuffles that may widen/narrow and/or move elements across lanes and back. Value *RootVec = nullptr; for (unsigned i = 0; i != MaskNumElts; ++i) { // Note that recursion is limited for each vector element, so if any element // exceeds the limit, this will fail to simplify. RootVec = foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse); // We can't replace a widening/narrowing shuffle with one of its operands. if (!RootVec || RootVec->getType() != RetTy) return nullptr; } return RootVec; } /// Given operands for a ShuffleVectorInst, fold the result or return null. Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const SimplifyQuery &Q) { return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit); } static Constant *foldConstant(Instruction::UnaryOps Opcode, Value *&Op, const SimplifyQuery &Q) { if (auto *C = dyn_cast(Op)) return ConstantFoldUnaryOpOperand(Opcode, C, Q.DL); return nullptr; } /// Given the operand for an FNeg, see if we can fold the result. If not, this /// returns null. static Value *simplifyFNegInst(Value *Op, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldConstant(Instruction::FNeg, Op, Q)) return C; Value *X; // fneg (fneg X) ==> X if (match(Op, m_FNeg(m_Value(X)))) return X; return nullptr; } Value *llvm::SimplifyFNegInst(Value *Op, FastMathFlags FMF, const SimplifyQuery &Q) { return ::simplifyFNegInst(Op, FMF, Q, RecursionLimit); } static Constant *propagateNaN(Constant *In) { // If the input is a vector with undef elements, just return a default NaN. if (!In->isNaN()) return ConstantFP::getNaN(In->getType()); // Propagate the existing NaN constant when possible. // TODO: Should we quiet a signaling NaN? return In; } static Constant *simplifyFPBinop(Value *Op0, Value *Op1) { if (isa(Op0) || isa(Op1)) return ConstantFP::getNaN(Op0->getType()); if (match(Op0, m_NaN())) return propagateNaN(cast(Op0)); if (match(Op1, m_NaN())) return propagateNaN(cast(Op1)); return nullptr; } /// Given operands for an FAdd, see if we can fold the result. If not, this /// returns null. static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fadd X, -0 ==> X if (match(Op1, m_NegZeroFP())) return Op0; // fadd X, 0 ==> X, when we know X is not -0 if (match(Op1, m_PosZeroFP()) && (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) return Op0; // With nnan: -X + X --> 0.0 (and commuted variant) // We don't have to explicitly exclude infinities (ninf): INF + -INF == NaN. // Negative zeros are allowed because we always end up with positive zero: // X = -0.0: (-0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0 // X = -0.0: ( 0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0 // X = 0.0: (-0.0 - ( 0.0)) + ( 0.0) == (-0.0) + ( 0.0) == 0.0 // X = 0.0: ( 0.0 - ( 0.0)) + ( 0.0) == ( 0.0) + ( 0.0) == 0.0 if (FMF.noNaNs()) { if (match(Op0, m_FSub(m_AnyZeroFP(), m_Specific(Op1))) || match(Op1, m_FSub(m_AnyZeroFP(), m_Specific(Op0)))) return ConstantFP::getNullValue(Op0->getType()); if (match(Op0, m_FNeg(m_Specific(Op1))) || match(Op1, m_FNeg(m_Specific(Op0)))) return ConstantFP::getNullValue(Op0->getType()); } // (X - Y) + Y --> X // Y + (X - Y) --> X Value *X; if (FMF.noSignedZeros() && FMF.allowReassoc() && (match(Op0, m_FSub(m_Value(X), m_Specific(Op1))) || match(Op1, m_FSub(m_Value(X), m_Specific(Op0))))) return X; return nullptr; } /// Given operands for an FSub, see if we can fold the result. If not, this /// returns null. static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fsub X, +0 ==> X if (match(Op1, m_PosZeroFP())) return Op0; // fsub X, -0 ==> X, when we know X is not -0 if (match(Op1, m_NegZeroFP()) && (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) return Op0; // fsub -0.0, (fsub -0.0, X) ==> X // fsub -0.0, (fneg X) ==> X Value *X; if (match(Op0, m_NegZeroFP()) && match(Op1, m_FNeg(m_Value(X)))) return X; // fsub 0.0, (fsub 0.0, X) ==> X if signed zeros are ignored. // fsub 0.0, (fneg X) ==> X if signed zeros are ignored. if (FMF.noSignedZeros() && match(Op0, m_AnyZeroFP()) && (match(Op1, m_FSub(m_AnyZeroFP(), m_Value(X))) || match(Op1, m_FNeg(m_Value(X))))) return X; // fsub nnan x, x ==> 0.0 if (FMF.noNaNs() && Op0 == Op1) return Constant::getNullValue(Op0->getType()); // Y - (Y - X) --> X // (X + Y) - Y --> X if (FMF.noSignedZeros() && FMF.allowReassoc() && (match(Op1, m_FSub(m_Specific(Op0), m_Value(X))) || match(Op0, m_c_FAdd(m_Specific(Op1), m_Value(X))))) return X; return nullptr; } /// Given the operands for an FMul, see if we can fold the result static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // fmul X, 1.0 ==> X if (match(Op1, m_FPOne())) return Op0; // fmul nnan nsz X, 0 ==> 0 if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZeroFP())) return ConstantFP::getNullValue(Op0->getType()); // sqrt(X) * sqrt(X) --> X, if we can: // 1. Remove the intermediate rounding (reassociate). // 2. Ignore non-zero negative numbers because sqrt would produce NAN. // 3. Ignore -0.0 because sqrt(-0.0) == -0.0, but -0.0 * -0.0 == 0.0. Value *X; if (Op0 == Op1 && match(Op0, m_Intrinsic(m_Value(X))) && FMF.allowReassoc() && FMF.noNaNs() && FMF.noSignedZeros()) return X; return nullptr; } Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit); } Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFSubInst(Op0, Op1, FMF, Q, RecursionLimit); } Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit); } static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // X / 1.0 -> X if (match(Op1, m_FPOne())) return Op0; // 0 / X -> 0 // Requires that NaNs are off (X could be zero) and signed zeroes are // ignored (X could be positive or negative, so the output sign is unknown). if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op0, m_AnyZeroFP())) return ConstantFP::getNullValue(Op0->getType()); if (FMF.noNaNs()) { // X / X -> 1.0 is legal when NaNs are ignored. // We can ignore infinities because INF/INF is NaN. if (Op0 == Op1) return ConstantFP::get(Op0->getType(), 1.0); // (X * Y) / Y --> X if we can reassociate to the above form. Value *X; if (FMF.allowReassoc() && match(Op0, m_c_FMul(m_Value(X), m_Specific(Op1)))) return X; // -X / X -> -1.0 and // X / -X -> -1.0 are legal when NaNs are ignored. // We can ignore signed zeros because +-0.0/+-0.0 is NaN and ignored. if (match(Op0, m_FNegNSZ(m_Specific(Op1))) || match(Op1, m_FNegNSZ(m_Specific(Op0)))) return ConstantFP::get(Op0->getType(), -1.0); } return nullptr; } Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFDivInst(Op0, Op1, FMF, Q, RecursionLimit); } static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q)) return C; if (Constant *C = simplifyFPBinop(Op0, Op1)) return C; // Unlike fdiv, the result of frem always matches the sign of the dividend. // The constant match may include undef elements in a vector, so return a full // zero constant as the result. if (FMF.noNaNs()) { // +0 % X -> 0 if (match(Op0, m_PosZeroFP())) return ConstantFP::getNullValue(Op0->getType()); // -0 % X -> -0 if (match(Op0, m_NegZeroFP())) return ConstantFP::getNegativeZero(Op0->getType()); } return nullptr; } Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFRemInst(Op0, Op1, FMF, Q, RecursionLimit); } //=== Helper functions for higher up the class hierarchy. /// Given the operand for a UnaryOperator, see if we can fold the result. /// If not, this returns null. static Value *simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::FNeg: return simplifyFNegInst(Op, FastMathFlags(), Q, MaxRecurse); default: llvm_unreachable("Unexpected opcode"); } } /// Given the operand for a UnaryOperator, see if we can fold the result. /// If not, this returns null. /// In contrast to SimplifyUnOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyUnOp. static Value *simplifyFPUnOp(unsigned Opcode, Value *Op, const FastMathFlags &FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::FNeg: return simplifyFNegInst(Op, FMF, Q, MaxRecurse); default: return simplifyUnOp(Opcode, Op, Q, MaxRecurse); } } Value *llvm::SimplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q) { return ::simplifyUnOp(Opcode, Op, Q, RecursionLimit); } Value *llvm::SimplifyFPUnOp(unsigned Opcode, Value *Op, FastMathFlags FMF, const SimplifyQuery &Q) { return ::simplifyFPUnOp(Opcode, Op, FMF, Q, RecursionLimit); } /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::Add: return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::Sub: return SimplifySubInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::Mul: return SimplifyMulInst(LHS, RHS, Q, MaxRecurse); case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse); case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse); case Instruction::Shl: return SimplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse); case Instruction::LShr: return SimplifyLShrInst(LHS, RHS, false, Q, MaxRecurse); case Instruction::AShr: return SimplifyAShrInst(LHS, RHS, false, Q, MaxRecurse); case Instruction::And: return SimplifyAndInst(LHS, RHS, Q, MaxRecurse); case Instruction::Or: return SimplifyOrInst(LHS, RHS, Q, MaxRecurse); case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse); case Instruction::FAdd: return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FSub: return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FMul: return SimplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); default: llvm_unreachable("Unexpected opcode"); } } /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, const FastMathFlags &FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::FAdd: return SimplifyFAddInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FSub: return SimplifyFSubInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FMul: return SimplifyFMulInst(LHS, RHS, FMF, Q, MaxRecurse); case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, FMF, Q, MaxRecurse); default: return SimplifyBinOp(Opcode, LHS, RHS, Q, MaxRecurse); } } Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit); } Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit); } /// Given operands for a CmpInst, see if we can fold the result. static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse); return SimplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse); } Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::SimplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } static bool IsIdempotent(Intrinsic::ID ID) { switch (ID) { default: return false; // Unary idempotent: f(f(x)) = f(x) case Intrinsic::fabs: case Intrinsic::floor: case Intrinsic::ceil: case Intrinsic::trunc: case Intrinsic::rint: case Intrinsic::nearbyint: case Intrinsic::round: case Intrinsic::canonicalize: return true; } } static Value *SimplifyRelativeLoad(Constant *Ptr, Constant *Offset, const DataLayout &DL) { GlobalValue *PtrSym; APInt PtrOffset; if (!IsConstantOffsetFromGlobal(Ptr, PtrSym, PtrOffset, DL)) return nullptr; Type *Int8PtrTy = Type::getInt8PtrTy(Ptr->getContext()); Type *Int32Ty = Type::getInt32Ty(Ptr->getContext()); Type *Int32PtrTy = Int32Ty->getPointerTo(); Type *Int64Ty = Type::getInt64Ty(Ptr->getContext()); auto *OffsetConstInt = dyn_cast(Offset); if (!OffsetConstInt || OffsetConstInt->getType()->getBitWidth() > 64) return nullptr; uint64_t OffsetInt = OffsetConstInt->getSExtValue(); if (OffsetInt % 4 != 0) return nullptr; Constant *C = ConstantExpr::getGetElementPtr( Int32Ty, ConstantExpr::getBitCast(Ptr, Int32PtrTy), ConstantInt::get(Int64Ty, OffsetInt / 4)); Constant *Loaded = ConstantFoldLoadFromConstPtr(C, Int32Ty, DL); if (!Loaded) return nullptr; auto *LoadedCE = dyn_cast(Loaded); if (!LoadedCE) return nullptr; if (LoadedCE->getOpcode() == Instruction::Trunc) { LoadedCE = dyn_cast(LoadedCE->getOperand(0)); if (!LoadedCE) return nullptr; } if (LoadedCE->getOpcode() != Instruction::Sub) return nullptr; auto *LoadedLHS = dyn_cast(LoadedCE->getOperand(0)); if (!LoadedLHS || LoadedLHS->getOpcode() != Instruction::PtrToInt) return nullptr; auto *LoadedLHSPtr = LoadedLHS->getOperand(0); Constant *LoadedRHS = LoadedCE->getOperand(1); GlobalValue *LoadedRHSSym; APInt LoadedRHSOffset; if (!IsConstantOffsetFromGlobal(LoadedRHS, LoadedRHSSym, LoadedRHSOffset, DL) || PtrSym != LoadedRHSSym || PtrOffset != LoadedRHSOffset) return nullptr; return ConstantExpr::getBitCast(LoadedLHSPtr, Int8PtrTy); } static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0, const SimplifyQuery &Q) { // Idempotent functions return the same result when called repeatedly. Intrinsic::ID IID = F->getIntrinsicID(); if (IsIdempotent(IID)) if (auto *II = dyn_cast(Op0)) if (II->getIntrinsicID() == IID) return II; Value *X; switch (IID) { case Intrinsic::fabs: if (SignBitMustBeZero(Op0, Q.TLI)) return Op0; break; case Intrinsic::bswap: // bswap(bswap(x)) -> x if (match(Op0, m_BSwap(m_Value(X)))) return X; break; case Intrinsic::bitreverse: // bitreverse(bitreverse(x)) -> x if (match(Op0, m_BitReverse(m_Value(X)))) return X; break; case Intrinsic::exp: // exp(log(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::exp2: // exp2(log2(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::log: // log(exp(x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_Value(X)))) return X; break; case Intrinsic::log2: // log2(exp2(x)) -> x if (Q.CxtI->hasAllowReassoc() && (match(Op0, m_Intrinsic(m_Value(X))) || match(Op0, m_Intrinsic(m_SpecificFP(2.0), m_Value(X))))) return X; break; case Intrinsic::log10: // log10(pow(10.0, x)) -> x if (Q.CxtI->hasAllowReassoc() && match(Op0, m_Intrinsic(m_SpecificFP(10.0), m_Value(X)))) return X; break; case Intrinsic::floor: case Intrinsic::trunc: case Intrinsic::ceil: case Intrinsic::round: case Intrinsic::nearbyint: case Intrinsic::rint: { // floor (sitofp x) -> sitofp x // floor (uitofp x) -> uitofp x // // Converting from int always results in a finite integral number or // infinity. For either of those inputs, these rounding functions always // return the same value, so the rounding can be eliminated. if (match(Op0, m_SIToFP(m_Value())) || match(Op0, m_UIToFP(m_Value()))) return Op0; break; } default: break; } return nullptr; } static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, const SimplifyQuery &Q) { Intrinsic::ID IID = F->getIntrinsicID(); Type *ReturnType = F->getReturnType(); switch (IID) { case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: // X - X -> { 0, false } if (Op0 == Op1) return Constant::getNullValue(ReturnType); LLVM_FALLTHROUGH; case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: // X - undef -> { undef, false } // undef - X -> { undef, false } // X + undef -> { undef, false } // undef + x -> { undef, false } if (isa(Op0) || isa(Op1)) { return ConstantStruct::get( cast(ReturnType), {UndefValue::get(ReturnType->getStructElementType(0)), Constant::getNullValue(ReturnType->getStructElementType(1))}); } break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: // 0 * X -> { 0, false } // X * 0 -> { 0, false } if (match(Op0, m_Zero()) || match(Op1, m_Zero())) return Constant::getNullValue(ReturnType); // undef * X -> { 0, false } // X * undef -> { 0, false } if (match(Op0, m_Undef()) || match(Op1, m_Undef())) return Constant::getNullValue(ReturnType); break; case Intrinsic::uadd_sat: // sat(MAX + X) -> MAX // sat(X + MAX) -> MAX if (match(Op0, m_AllOnes()) || match(Op1, m_AllOnes())) return Constant::getAllOnesValue(ReturnType); LLVM_FALLTHROUGH; case Intrinsic::sadd_sat: // sat(X + undef) -> -1 // sat(undef + X) -> -1 // For unsigned: Assume undef is MAX, thus we saturate to MAX (-1). // For signed: Assume undef is ~X, in which case X + ~X = -1. if (match(Op0, m_Undef()) || match(Op1, m_Undef())) return Constant::getAllOnesValue(ReturnType); // X + 0 -> X if (match(Op1, m_Zero())) return Op0; // 0 + X -> X if (match(Op0, m_Zero())) return Op1; break; case Intrinsic::usub_sat: // sat(0 - X) -> 0, sat(X - MAX) -> 0 if (match(Op0, m_Zero()) || match(Op1, m_AllOnes())) return Constant::getNullValue(ReturnType); LLVM_FALLTHROUGH; case Intrinsic::ssub_sat: // X - X -> 0, X - undef -> 0, undef - X -> 0 if (Op0 == Op1 || match(Op0, m_Undef()) || match(Op1, m_Undef())) return Constant::getNullValue(ReturnType); // X - 0 -> X if (match(Op1, m_Zero())) return Op0; break; case Intrinsic::load_relative: if (auto *C0 = dyn_cast(Op0)) if (auto *C1 = dyn_cast(Op1)) return SimplifyRelativeLoad(C0, C1, Q.DL); break; case Intrinsic::powi: if (auto *Power = dyn_cast(Op1)) { // powi(x, 0) -> 1.0 if (Power->isZero()) return ConstantFP::get(Op0->getType(), 1.0); // powi(x, 1) -> x if (Power->isOne()) return Op0; } break; case Intrinsic::maxnum: case Intrinsic::minnum: case Intrinsic::maximum: case Intrinsic::minimum: { // If the arguments are the same, this is a no-op. if (Op0 == Op1) return Op0; // If one argument is undef, return the other argument. if (match(Op0, m_Undef())) return Op1; if (match(Op1, m_Undef())) return Op0; // If one argument is NaN, return other or NaN appropriately. bool PropagateNaN = IID == Intrinsic::minimum || IID == Intrinsic::maximum; if (match(Op0, m_NaN())) return PropagateNaN ? Op0 : Op1; if (match(Op1, m_NaN())) return PropagateNaN ? Op1 : Op0; // Min/max of the same operation with common operand: // m(m(X, Y)), X --> m(X, Y) (4 commuted variants) if (auto *M0 = dyn_cast(Op0)) if (M0->getIntrinsicID() == IID && (M0->getOperand(0) == Op1 || M0->getOperand(1) == Op1)) return Op0; if (auto *M1 = dyn_cast(Op1)) if (M1->getIntrinsicID() == IID && (M1->getOperand(0) == Op0 || M1->getOperand(1) == Op0)) return Op1; // min(X, -Inf) --> -Inf (and commuted variant) // max(X, +Inf) --> +Inf (and commuted variant) bool UseNegInf = IID == Intrinsic::minnum || IID == Intrinsic::minimum; const APFloat *C; if ((match(Op0, m_APFloat(C)) && C->isInfinity() && C->isNegative() == UseNegInf) || (match(Op1, m_APFloat(C)) && C->isInfinity() && C->isNegative() == UseNegInf)) return ConstantFP::getInfinity(ReturnType, UseNegInf); // TODO: minnum(nnan x, inf) -> x // TODO: minnum(nnan ninf x, flt_max) -> x // TODO: maxnum(nnan x, -inf) -> x // TODO: maxnum(nnan ninf x, -flt_max) -> x break; } default: break; } return nullptr; } static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { // Intrinsics with no operands have some kind of side effect. Don't simplify. unsigned NumOperands = Call->getNumArgOperands(); if (!NumOperands) return nullptr; Function *F = cast(Call->getCalledFunction()); Intrinsic::ID IID = F->getIntrinsicID(); if (NumOperands == 1) return simplifyUnaryIntrinsic(F, Call->getArgOperand(0), Q); if (NumOperands == 2) return simplifyBinaryIntrinsic(F, Call->getArgOperand(0), Call->getArgOperand(1), Q); // Handle intrinsics with 3 or more arguments. switch (IID) { case Intrinsic::masked_load: case Intrinsic::masked_gather: { Value *MaskArg = Call->getArgOperand(2); Value *PassthruArg = Call->getArgOperand(3); // If the mask is all zeros or undef, the "passthru" argument is the result. if (maskIsAllZeroOrUndef(MaskArg)) return PassthruArg; return nullptr; } case Intrinsic::fshl: case Intrinsic::fshr: { Value *Op0 = Call->getArgOperand(0), *Op1 = Call->getArgOperand(1), *ShAmtArg = Call->getArgOperand(2); // If both operands are undef, the result is undef. if (match(Op0, m_Undef()) && match(Op1, m_Undef())) return UndefValue::get(F->getReturnType()); // If shift amount is undef, assume it is zero. if (match(ShAmtArg, m_Undef())) return Call->getArgOperand(IID == Intrinsic::fshl ? 0 : 1); const APInt *ShAmtC; if (match(ShAmtArg, m_APInt(ShAmtC))) { // If there's effectively no shift, return the 1st arg or 2nd arg. APInt BitWidth = APInt(ShAmtC->getBitWidth(), ShAmtC->getBitWidth()); if (ShAmtC->urem(BitWidth).isNullValue()) return Call->getArgOperand(IID == Intrinsic::fshl ? 0 : 1); } return nullptr; } default: return nullptr; } } Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) { Value *Callee = Call->getCalledValue(); // call undef -> undef // call null -> undef if (isa(Callee) || isa(Callee)) return UndefValue::get(Call->getType()); Function *F = dyn_cast(Callee); if (!F) return nullptr; if (F->isIntrinsic()) if (Value *Ret = simplifyIntrinsic(Call, Q)) return Ret; if (!canConstantFoldCallTo(Call, F)) return nullptr; SmallVector ConstantArgs; unsigned NumArgs = Call->getNumArgOperands(); ConstantArgs.reserve(NumArgs); for (auto &Arg : Call->args()) { Constant *C = dyn_cast(&Arg); if (!C) return nullptr; ConstantArgs.push_back(C); } return ConstantFoldCall(Call, F, ConstantArgs, Q.TLI); } /// See if we can compute a simplified version of this instruction. /// If not, this returns null. Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ, OptimizationRemarkEmitter *ORE) { const SimplifyQuery Q = SQ.CxtI ? SQ : SQ.getWithInstruction(I); Value *Result; switch (I->getOpcode()) { default: Result = ConstantFoldInstruction(I, Q.DL, Q.TLI); break; case Instruction::FNeg: Result = SimplifyFNegInst(I->getOperand(0), I->getFastMathFlags(), Q); break; case Instruction::FAdd: Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Add: Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1), Q.IIQ.hasNoSignedWrap(cast(I)), Q.IIQ.hasNoUnsignedWrap(cast(I)), Q); break; case Instruction::FSub: Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Sub: Result = SimplifySubInst(I->getOperand(0), I->getOperand(1), Q.IIQ.hasNoSignedWrap(cast(I)), Q.IIQ.hasNoUnsignedWrap(cast(I)), Q); break; case Instruction::FMul: Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Mul: Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::SDiv: Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::UDiv: Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FDiv: Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::SRem: Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::URem: Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FRem: Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Shl: Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1), Q.IIQ.hasNoSignedWrap(cast(I)), Q.IIQ.hasNoUnsignedWrap(cast(I)), Q); break; case Instruction::LShr: Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1), Q.IIQ.isExact(cast(I)), Q); break; case Instruction::AShr: Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1), Q.IIQ.isExact(cast(I)), Q); break; case Instruction::And: Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::Or: Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::Xor: Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::ICmp: Result = SimplifyICmpInst(cast(I)->getPredicate(), I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FCmp: Result = SimplifyFCmpInst(cast(I)->getPredicate(), I->getOperand(0), I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Select: Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1), I->getOperand(2), Q); break; case Instruction::GetElementPtr: { SmallVector Ops(I->op_begin(), I->op_end()); Result = SimplifyGEPInst(cast(I)->getSourceElementType(), Ops, Q); break; } case Instruction::InsertValue: { InsertValueInst *IV = cast(I); Result = SimplifyInsertValueInst(IV->getAggregateOperand(), IV->getInsertedValueOperand(), IV->getIndices(), Q); break; } case Instruction::InsertElement: { auto *IE = cast(I); Result = SimplifyInsertElementInst(IE->getOperand(0), IE->getOperand(1), IE->getOperand(2), Q); break; } case Instruction::ExtractValue: { auto *EVI = cast(I); Result = SimplifyExtractValueInst(EVI->getAggregateOperand(), EVI->getIndices(), Q); break; } case Instruction::ExtractElement: { auto *EEI = cast(I); Result = SimplifyExtractElementInst(EEI->getVectorOperand(), EEI->getIndexOperand(), Q); break; } case Instruction::ShuffleVector: { auto *SVI = cast(I); Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), SVI->getMask(), SVI->getType(), Q); break; } case Instruction::PHI: Result = SimplifyPHINode(cast(I), Q); break; case Instruction::Call: { Result = SimplifyCall(cast(I), Q); break; } #define HANDLE_CAST_INST(num, opc, clas) case Instruction::opc: #include "llvm/IR/Instruction.def" #undef HANDLE_CAST_INST Result = SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(), Q); break; case Instruction::Alloca: // No simplifications for Alloca and it can't be constant folded. Result = nullptr; break; } // In general, it is possible for computeKnownBits to determine all bits in a // value even when the operands are not all constants. if (!Result && I->getType()->isIntOrIntVectorTy()) { KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE); if (Known.isConstant()) Result = ConstantInt::get(I->getType(), Known.getConstant()); } /// If called on unreachable code, the above logic may report that the /// instruction simplified to itself. Make life easier for users by /// detecting that case here, returning a safe value instead. return Result == I ? UndefValue::get(I->getType()) : Result; } /// Implementation of recursive simplification through an instruction's /// uses. /// /// This is the common implementation of the recursive simplification routines. /// If we have a pre-simplified value in 'SimpleV', that is forcibly used to /// replace the instruction 'I'. Otherwise, we simply add 'I' to the list of /// instructions to process and attempt to simplify it using -/// InstructionSimplify. +/// InstructionSimplify. Recursively visited users which could not be +/// simplified themselves are to the optional UnsimplifiedUsers set for +/// further processing by the caller. /// /// This routine returns 'true' only when *it* simplifies something. The passed /// in simplified value does not count toward this. -static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - AssumptionCache *AC) { +static bool replaceAndRecursivelySimplifyImpl( + Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI, + const DominatorTree *DT, AssumptionCache *AC, + SmallSetVector *UnsimplifiedUsers = nullptr) { bool Simplified = false; SmallSetVector Worklist; const DataLayout &DL = I->getModule()->getDataLayout(); // If we have an explicit value to collapse to, do that round of the // simplification loop by hand initially. if (SimpleV) { for (User *U : I->users()) if (U != I) Worklist.insert(cast(U)); // Replace the instruction with its simplified value. I->replaceAllUsesWith(SimpleV); // Gracefully handle edge cases where the instruction is not wired into any // parent block. if (I->getParent() && !I->isEHPad() && !I->isTerminator() && !I->mayHaveSideEffects()) I->eraseFromParent(); } else { Worklist.insert(I); } // Note that we must test the size on each iteration, the worklist can grow. for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) { I = Worklist[Idx]; // See if this instruction simplifies. SimpleV = SimplifyInstruction(I, {DL, TLI, DT, AC}); - if (!SimpleV) + if (!SimpleV) { + if (UnsimplifiedUsers) + UnsimplifiedUsers->insert(I); continue; + } Simplified = true; // Stash away all the uses of the old instruction so we can check them for // recursive simplifications after a RAUW. This is cheaper than checking all // uses of To on the recursive step in most cases. for (User *U : I->users()) Worklist.insert(cast(U)); // Replace the instruction with its simplified value. I->replaceAllUsesWith(SimpleV); // Gracefully handle edge cases where the instruction is not wired into any // parent block. if (I->getParent() && !I->isEHPad() && !I->isTerminator() && !I->mayHaveSideEffects()) I->eraseFromParent(); } return Simplified; } bool llvm::recursivelySimplifyInstruction(Instruction *I, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { - return replaceAndRecursivelySimplifyImpl(I, nullptr, TLI, DT, AC); + return replaceAndRecursivelySimplifyImpl(I, nullptr, TLI, DT, AC, nullptr); } -bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - AssumptionCache *AC) { +bool llvm::replaceAndRecursivelySimplify( + Instruction *I, Value *SimpleV, const TargetLibraryInfo *TLI, + const DominatorTree *DT, AssumptionCache *AC, + SmallSetVector *UnsimplifiedUsers) { assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!"); assert(SimpleV && "Must provide a simplified value."); - return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC); + return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC, + UnsimplifiedUsers); } namespace llvm { const SimplifyQuery getBestSimplifyQuery(Pass &P, Function &F) { auto *DTWP = P.getAnalysisIfAvailable(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *TLIWP = P.getAnalysisIfAvailable(); auto *TLI = TLIWP ? &TLIWP->getTLI() : nullptr; auto *ACWP = P.getAnalysisIfAvailable(); auto *AC = ACWP ? &ACWP->getAssumptionCache(F) : nullptr; return {F.getParent()->getDataLayout(), TLI, DT, AC}; } const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &AR, const DataLayout &DL) { return {DL, &AR.TLI, &AR.DT, &AR.AC}; } template const SimplifyQuery getBestSimplifyQuery(AnalysisManager &AM, Function &F) { auto *DT = AM.template getCachedResult(F); auto *TLI = AM.template getCachedResult(F); auto *AC = AM.template getCachedResult(F); return {F.getParent()->getDataLayout(), TLI, DT, AC}; } template const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, Function &); } Index: vendor/llvm/dist-release_90/lib/IR/Core.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/IR/Core.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/IR/Core.cpp (revision 351709) @@ -1,4043 +1,4052 @@ //===-- Core.cpp ----------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the common infrastructure (including the C bindings) // for libLLVMCore.a, which implements the LLVM intermediate representation. // //===----------------------------------------------------------------------===// #include "llvm-c/Core.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "ir" void llvm::initializeCore(PassRegistry &Registry) { initializeDominatorTreeWrapperPassPass(Registry); initializePrintModulePassWrapperPass(Registry); initializePrintFunctionPassWrapperPass(Registry); initializePrintBasicBlockPassPass(Registry); initializeSafepointIRVerifierPass(Registry); initializeVerifierLegacyPassPass(Registry); } void LLVMInitializeCore(LLVMPassRegistryRef R) { initializeCore(*unwrap(R)); } void LLVMShutdown() { llvm_shutdown(); } /*===-- Error handling ----------------------------------------------------===*/ char *LLVMCreateMessage(const char *Message) { return strdup(Message); } void LLVMDisposeMessage(char *Message) { free(Message); } /*===-- Operations on contexts --------------------------------------------===*/ static ManagedStatic GlobalContext; LLVMContextRef LLVMContextCreate() { return wrap(new LLVMContext()); } LLVMContextRef LLVMGetGlobalContext() { return wrap(&*GlobalContext); } void LLVMContextSetDiagnosticHandler(LLVMContextRef C, LLVMDiagnosticHandler Handler, void *DiagnosticContext) { unwrap(C)->setDiagnosticHandlerCallBack( LLVM_EXTENSION reinterpret_cast( Handler), DiagnosticContext); } LLVMDiagnosticHandler LLVMContextGetDiagnosticHandler(LLVMContextRef C) { return LLVM_EXTENSION reinterpret_cast( unwrap(C)->getDiagnosticHandlerCallBack()); } void *LLVMContextGetDiagnosticContext(LLVMContextRef C) { return unwrap(C)->getDiagnosticContext(); } void LLVMContextSetYieldCallback(LLVMContextRef C, LLVMYieldCallback Callback, void *OpaqueHandle) { auto YieldCallback = LLVM_EXTENSION reinterpret_cast(Callback); unwrap(C)->setYieldCallback(YieldCallback, OpaqueHandle); } LLVMBool LLVMContextShouldDiscardValueNames(LLVMContextRef C) { return unwrap(C)->shouldDiscardValueNames(); } void LLVMContextSetDiscardValueNames(LLVMContextRef C, LLVMBool Discard) { unwrap(C)->setDiscardValueNames(Discard); } void LLVMContextDispose(LLVMContextRef C) { delete unwrap(C); } unsigned LLVMGetMDKindIDInContext(LLVMContextRef C, const char *Name, unsigned SLen) { return unwrap(C)->getMDKindID(StringRef(Name, SLen)); } unsigned LLVMGetMDKindID(const char *Name, unsigned SLen) { return LLVMGetMDKindIDInContext(LLVMGetGlobalContext(), Name, SLen); } #define GET_ATTR_KIND_FROM_NAME #include "AttributesCompatFunc.inc" unsigned LLVMGetEnumAttributeKindForName(const char *Name, size_t SLen) { return getAttrKindFromName(StringRef(Name, SLen)); } unsigned LLVMGetLastEnumAttributeKind(void) { return Attribute::AttrKind::EndAttrKinds; } LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID, uint64_t Val) { - return wrap(Attribute::get(*unwrap(C), (Attribute::AttrKind)KindID, Val)); + auto &Ctx = *unwrap(C); + auto AttrKind = (Attribute::AttrKind)KindID; + + if (AttrKind == Attribute::AttrKind::ByVal) { + // After r362128, byval attributes need to have a type attribute. Provide a + // NULL one until a proper API is added for this. + return wrap(Attribute::getWithByValType(Ctx, NULL)); + } else { + return wrap(Attribute::get(Ctx, AttrKind, Val)); + } } unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A) { return unwrap(A).getKindAsEnum(); } uint64_t LLVMGetEnumAttributeValue(LLVMAttributeRef A) { auto Attr = unwrap(A); if (Attr.isEnumAttribute()) return 0; return Attr.getValueAsInt(); } LLVMAttributeRef LLVMCreateStringAttribute(LLVMContextRef C, const char *K, unsigned KLength, const char *V, unsigned VLength) { return wrap(Attribute::get(*unwrap(C), StringRef(K, KLength), StringRef(V, VLength))); } const char *LLVMGetStringAttributeKind(LLVMAttributeRef A, unsigned *Length) { auto S = unwrap(A).getKindAsString(); *Length = S.size(); return S.data(); } const char *LLVMGetStringAttributeValue(LLVMAttributeRef A, unsigned *Length) { auto S = unwrap(A).getValueAsString(); *Length = S.size(); return S.data(); } LLVMBool LLVMIsEnumAttribute(LLVMAttributeRef A) { auto Attr = unwrap(A); return Attr.isEnumAttribute() || Attr.isIntAttribute(); } LLVMBool LLVMIsStringAttribute(LLVMAttributeRef A) { return unwrap(A).isStringAttribute(); } char *LLVMGetDiagInfoDescription(LLVMDiagnosticInfoRef DI) { std::string MsgStorage; raw_string_ostream Stream(MsgStorage); DiagnosticPrinterRawOStream DP(Stream); unwrap(DI)->print(DP); Stream.flush(); return LLVMCreateMessage(MsgStorage.c_str()); } LLVMDiagnosticSeverity LLVMGetDiagInfoSeverity(LLVMDiagnosticInfoRef DI) { LLVMDiagnosticSeverity severity; switch(unwrap(DI)->getSeverity()) { default: severity = LLVMDSError; break; case DS_Warning: severity = LLVMDSWarning; break; case DS_Remark: severity = LLVMDSRemark; break; case DS_Note: severity = LLVMDSNote; break; } return severity; } /*===-- Operations on modules ---------------------------------------------===*/ LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) { return wrap(new Module(ModuleID, *GlobalContext)); } LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID, LLVMContextRef C) { return wrap(new Module(ModuleID, *unwrap(C))); } void LLVMDisposeModule(LLVMModuleRef M) { delete unwrap(M); } const char *LLVMGetModuleIdentifier(LLVMModuleRef M, size_t *Len) { auto &Str = unwrap(M)->getModuleIdentifier(); *Len = Str.length(); return Str.c_str(); } void LLVMSetModuleIdentifier(LLVMModuleRef M, const char *Ident, size_t Len) { unwrap(M)->setModuleIdentifier(StringRef(Ident, Len)); } const char *LLVMGetSourceFileName(LLVMModuleRef M, size_t *Len) { auto &Str = unwrap(M)->getSourceFileName(); *Len = Str.length(); return Str.c_str(); } void LLVMSetSourceFileName(LLVMModuleRef M, const char *Name, size_t Len) { unwrap(M)->setSourceFileName(StringRef(Name, Len)); } /*--.. Data layout .........................................................--*/ const char *LLVMGetDataLayoutStr(LLVMModuleRef M) { return unwrap(M)->getDataLayoutStr().c_str(); } const char *LLVMGetDataLayout(LLVMModuleRef M) { return LLVMGetDataLayoutStr(M); } void LLVMSetDataLayout(LLVMModuleRef M, const char *DataLayoutStr) { unwrap(M)->setDataLayout(DataLayoutStr); } /*--.. Target triple .......................................................--*/ const char * LLVMGetTarget(LLVMModuleRef M) { return unwrap(M)->getTargetTriple().c_str(); } void LLVMSetTarget(LLVMModuleRef M, const char *Triple) { unwrap(M)->setTargetTriple(Triple); } /*--.. Module flags ........................................................--*/ struct LLVMOpaqueModuleFlagEntry { LLVMModuleFlagBehavior Behavior; const char *Key; size_t KeyLen; LLVMMetadataRef Metadata; }; static Module::ModFlagBehavior map_to_llvmModFlagBehavior(LLVMModuleFlagBehavior Behavior) { switch (Behavior) { case LLVMModuleFlagBehaviorError: return Module::ModFlagBehavior::Error; case LLVMModuleFlagBehaviorWarning: return Module::ModFlagBehavior::Warning; case LLVMModuleFlagBehaviorRequire: return Module::ModFlagBehavior::Require; case LLVMModuleFlagBehaviorOverride: return Module::ModFlagBehavior::Override; case LLVMModuleFlagBehaviorAppend: return Module::ModFlagBehavior::Append; case LLVMModuleFlagBehaviorAppendUnique: return Module::ModFlagBehavior::AppendUnique; } llvm_unreachable("Unknown LLVMModuleFlagBehavior"); } static LLVMModuleFlagBehavior map_from_llvmModFlagBehavior(Module::ModFlagBehavior Behavior) { switch (Behavior) { case Module::ModFlagBehavior::Error: return LLVMModuleFlagBehaviorError; case Module::ModFlagBehavior::Warning: return LLVMModuleFlagBehaviorWarning; case Module::ModFlagBehavior::Require: return LLVMModuleFlagBehaviorRequire; case Module::ModFlagBehavior::Override: return LLVMModuleFlagBehaviorOverride; case Module::ModFlagBehavior::Append: return LLVMModuleFlagBehaviorAppend; case Module::ModFlagBehavior::AppendUnique: return LLVMModuleFlagBehaviorAppendUnique; default: llvm_unreachable("Unhandled Flag Behavior"); } } LLVMModuleFlagEntry *LLVMCopyModuleFlagsMetadata(LLVMModuleRef M, size_t *Len) { SmallVector MFEs; unwrap(M)->getModuleFlagsMetadata(MFEs); LLVMOpaqueModuleFlagEntry *Result = static_cast( safe_malloc(MFEs.size() * sizeof(LLVMOpaqueModuleFlagEntry))); for (unsigned i = 0; i < MFEs.size(); ++i) { const auto &ModuleFlag = MFEs[i]; Result[i].Behavior = map_from_llvmModFlagBehavior(ModuleFlag.Behavior); Result[i].Key = ModuleFlag.Key->getString().data(); Result[i].KeyLen = ModuleFlag.Key->getString().size(); Result[i].Metadata = wrap(ModuleFlag.Val); } *Len = MFEs.size(); return Result; } void LLVMDisposeModuleFlagsMetadata(LLVMModuleFlagEntry *Entries) { free(Entries); } LLVMModuleFlagBehavior LLVMModuleFlagEntriesGetFlagBehavior(LLVMModuleFlagEntry *Entries, unsigned Index) { LLVMOpaqueModuleFlagEntry MFE = static_cast(Entries[Index]); return MFE.Behavior; } const char *LLVMModuleFlagEntriesGetKey(LLVMModuleFlagEntry *Entries, unsigned Index, size_t *Len) { LLVMOpaqueModuleFlagEntry MFE = static_cast(Entries[Index]); *Len = MFE.KeyLen; return MFE.Key; } LLVMMetadataRef LLVMModuleFlagEntriesGetMetadata(LLVMModuleFlagEntry *Entries, unsigned Index) { LLVMOpaqueModuleFlagEntry MFE = static_cast(Entries[Index]); return MFE.Metadata; } LLVMMetadataRef LLVMGetModuleFlag(LLVMModuleRef M, const char *Key, size_t KeyLen) { return wrap(unwrap(M)->getModuleFlag({Key, KeyLen})); } void LLVMAddModuleFlag(LLVMModuleRef M, LLVMModuleFlagBehavior Behavior, const char *Key, size_t KeyLen, LLVMMetadataRef Val) { unwrap(M)->addModuleFlag(map_to_llvmModFlagBehavior(Behavior), {Key, KeyLen}, unwrap(Val)); } /*--.. Printing modules ....................................................--*/ void LLVMDumpModule(LLVMModuleRef M) { unwrap(M)->print(errs(), nullptr, /*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/true); } LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename, char **ErrorMessage) { std::error_code EC; raw_fd_ostream dest(Filename, EC, sys::fs::F_Text); if (EC) { *ErrorMessage = strdup(EC.message().c_str()); return true; } unwrap(M)->print(dest, nullptr); dest.close(); if (dest.has_error()) { std::string E = "Error printing to file: " + dest.error().message(); *ErrorMessage = strdup(E.c_str()); return true; } return false; } char *LLVMPrintModuleToString(LLVMModuleRef M) { std::string buf; raw_string_ostream os(buf); unwrap(M)->print(os, nullptr); os.flush(); return strdup(buf.c_str()); } /*--.. Operations on inline assembler ......................................--*/ void LLVMSetModuleInlineAsm2(LLVMModuleRef M, const char *Asm, size_t Len) { unwrap(M)->setModuleInlineAsm(StringRef(Asm, Len)); } void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) { unwrap(M)->setModuleInlineAsm(StringRef(Asm)); } void LLVMAppendModuleInlineAsm(LLVMModuleRef M, const char *Asm, size_t Len) { unwrap(M)->appendModuleInlineAsm(StringRef(Asm, Len)); } const char *LLVMGetModuleInlineAsm(LLVMModuleRef M, size_t *Len) { auto &Str = unwrap(M)->getModuleInlineAsm(); *Len = Str.length(); return Str.c_str(); } LLVMValueRef LLVMGetInlineAsm(LLVMTypeRef Ty, char *AsmString, size_t AsmStringSize, char *Constraints, size_t ConstraintsSize, LLVMBool HasSideEffects, LLVMBool IsAlignStack, LLVMInlineAsmDialect Dialect) { InlineAsm::AsmDialect AD; switch (Dialect) { case LLVMInlineAsmDialectATT: AD = InlineAsm::AD_ATT; break; case LLVMInlineAsmDialectIntel: AD = InlineAsm::AD_Intel; break; } return wrap(InlineAsm::get(unwrap(Ty), StringRef(AsmString, AsmStringSize), StringRef(Constraints, ConstraintsSize), HasSideEffects, IsAlignStack, AD)); } /*--.. Operations on module contexts ......................................--*/ LLVMContextRef LLVMGetModuleContext(LLVMModuleRef M) { return wrap(&unwrap(M)->getContext()); } /*===-- Operations on types -----------------------------------------------===*/ /*--.. Operations on all types (mostly) ....................................--*/ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) { switch (unwrap(Ty)->getTypeID()) { case Type::VoidTyID: return LLVMVoidTypeKind; case Type::HalfTyID: return LLVMHalfTypeKind; case Type::FloatTyID: return LLVMFloatTypeKind; case Type::DoubleTyID: return LLVMDoubleTypeKind; case Type::X86_FP80TyID: return LLVMX86_FP80TypeKind; case Type::FP128TyID: return LLVMFP128TypeKind; case Type::PPC_FP128TyID: return LLVMPPC_FP128TypeKind; case Type::LabelTyID: return LLVMLabelTypeKind; case Type::MetadataTyID: return LLVMMetadataTypeKind; case Type::IntegerTyID: return LLVMIntegerTypeKind; case Type::FunctionTyID: return LLVMFunctionTypeKind; case Type::StructTyID: return LLVMStructTypeKind; case Type::ArrayTyID: return LLVMArrayTypeKind; case Type::PointerTyID: return LLVMPointerTypeKind; case Type::VectorTyID: return LLVMVectorTypeKind; case Type::X86_MMXTyID: return LLVMX86_MMXTypeKind; case Type::TokenTyID: return LLVMTokenTypeKind; } llvm_unreachable("Unhandled TypeID."); } LLVMBool LLVMTypeIsSized(LLVMTypeRef Ty) { return unwrap(Ty)->isSized(); } LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) { return wrap(&unwrap(Ty)->getContext()); } void LLVMDumpType(LLVMTypeRef Ty) { return unwrap(Ty)->print(errs(), /*IsForDebug=*/true); } char *LLVMPrintTypeToString(LLVMTypeRef Ty) { std::string buf; raw_string_ostream os(buf); if (unwrap(Ty)) unwrap(Ty)->print(os); else os << "Printing Type"; os.flush(); return strdup(buf.c_str()); } /*--.. Operations on integer types .........................................--*/ LLVMTypeRef LLVMInt1TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getInt1Ty(*unwrap(C)); } LLVMTypeRef LLVMInt8TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getInt8Ty(*unwrap(C)); } LLVMTypeRef LLVMInt16TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getInt16Ty(*unwrap(C)); } LLVMTypeRef LLVMInt32TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getInt32Ty(*unwrap(C)); } LLVMTypeRef LLVMInt64TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getInt64Ty(*unwrap(C)); } LLVMTypeRef LLVMInt128TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getInt128Ty(*unwrap(C)); } LLVMTypeRef LLVMIntTypeInContext(LLVMContextRef C, unsigned NumBits) { return wrap(IntegerType::get(*unwrap(C), NumBits)); } LLVMTypeRef LLVMInt1Type(void) { return LLVMInt1TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMInt8Type(void) { return LLVMInt8TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMInt16Type(void) { return LLVMInt16TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMInt32Type(void) { return LLVMInt32TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMInt64Type(void) { return LLVMInt64TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMInt128Type(void) { return LLVMInt128TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMIntType(unsigned NumBits) { return LLVMIntTypeInContext(LLVMGetGlobalContext(), NumBits); } unsigned LLVMGetIntTypeWidth(LLVMTypeRef IntegerTy) { return unwrap(IntegerTy)->getBitWidth(); } /*--.. Operations on real types ............................................--*/ LLVMTypeRef LLVMHalfTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getHalfTy(*unwrap(C)); } LLVMTypeRef LLVMFloatTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getFloatTy(*unwrap(C)); } LLVMTypeRef LLVMDoubleTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getDoubleTy(*unwrap(C)); } LLVMTypeRef LLVMX86FP80TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getX86_FP80Ty(*unwrap(C)); } LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getFP128Ty(*unwrap(C)); } LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C)); } LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C)); } LLVMTypeRef LLVMHalfType(void) { return LLVMHalfTypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMFloatType(void) { return LLVMFloatTypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMDoubleType(void) { return LLVMDoubleTypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMX86FP80Type(void) { return LLVMX86FP80TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMFP128Type(void) { return LLVMFP128TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMPPCFP128Type(void) { return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMX86MMXType(void) { return LLVMX86MMXTypeInContext(LLVMGetGlobalContext()); } /*--.. Operations on function types ........................................--*/ LLVMTypeRef LLVMFunctionType(LLVMTypeRef ReturnType, LLVMTypeRef *ParamTypes, unsigned ParamCount, LLVMBool IsVarArg) { ArrayRef Tys(unwrap(ParamTypes), ParamCount); return wrap(FunctionType::get(unwrap(ReturnType), Tys, IsVarArg != 0)); } LLVMBool LLVMIsFunctionVarArg(LLVMTypeRef FunctionTy) { return unwrap(FunctionTy)->isVarArg(); } LLVMTypeRef LLVMGetReturnType(LLVMTypeRef FunctionTy) { return wrap(unwrap(FunctionTy)->getReturnType()); } unsigned LLVMCountParamTypes(LLVMTypeRef FunctionTy) { return unwrap(FunctionTy)->getNumParams(); } void LLVMGetParamTypes(LLVMTypeRef FunctionTy, LLVMTypeRef *Dest) { FunctionType *Ty = unwrap(FunctionTy); for (FunctionType::param_iterator I = Ty->param_begin(), E = Ty->param_end(); I != E; ++I) *Dest++ = wrap(*I); } /*--.. Operations on struct types ..........................................--*/ LLVMTypeRef LLVMStructTypeInContext(LLVMContextRef C, LLVMTypeRef *ElementTypes, unsigned ElementCount, LLVMBool Packed) { ArrayRef Tys(unwrap(ElementTypes), ElementCount); return wrap(StructType::get(*unwrap(C), Tys, Packed != 0)); } LLVMTypeRef LLVMStructType(LLVMTypeRef *ElementTypes, unsigned ElementCount, LLVMBool Packed) { return LLVMStructTypeInContext(LLVMGetGlobalContext(), ElementTypes, ElementCount, Packed); } LLVMTypeRef LLVMStructCreateNamed(LLVMContextRef C, const char *Name) { return wrap(StructType::create(*unwrap(C), Name)); } const char *LLVMGetStructName(LLVMTypeRef Ty) { StructType *Type = unwrap(Ty); if (!Type->hasName()) return nullptr; return Type->getName().data(); } void LLVMStructSetBody(LLVMTypeRef StructTy, LLVMTypeRef *ElementTypes, unsigned ElementCount, LLVMBool Packed) { ArrayRef Tys(unwrap(ElementTypes), ElementCount); unwrap(StructTy)->setBody(Tys, Packed != 0); } unsigned LLVMCountStructElementTypes(LLVMTypeRef StructTy) { return unwrap(StructTy)->getNumElements(); } void LLVMGetStructElementTypes(LLVMTypeRef StructTy, LLVMTypeRef *Dest) { StructType *Ty = unwrap(StructTy); for (StructType::element_iterator I = Ty->element_begin(), E = Ty->element_end(); I != E; ++I) *Dest++ = wrap(*I); } LLVMTypeRef LLVMStructGetTypeAtIndex(LLVMTypeRef StructTy, unsigned i) { StructType *Ty = unwrap(StructTy); return wrap(Ty->getTypeAtIndex(i)); } LLVMBool LLVMIsPackedStruct(LLVMTypeRef StructTy) { return unwrap(StructTy)->isPacked(); } LLVMBool LLVMIsOpaqueStruct(LLVMTypeRef StructTy) { return unwrap(StructTy)->isOpaque(); } LLVMBool LLVMIsLiteralStruct(LLVMTypeRef StructTy) { return unwrap(StructTy)->isLiteral(); } LLVMTypeRef LLVMGetTypeByName(LLVMModuleRef M, const char *Name) { return wrap(unwrap(M)->getTypeByName(Name)); } /*--.. Operations on array, pointer, and vector types (sequence types) .....--*/ void LLVMGetSubtypes(LLVMTypeRef Tp, LLVMTypeRef *Arr) { int i = 0; for (auto *T : unwrap(Tp)->subtypes()) { Arr[i] = wrap(T); i++; } } LLVMTypeRef LLVMArrayType(LLVMTypeRef ElementType, unsigned ElementCount) { return wrap(ArrayType::get(unwrap(ElementType), ElementCount)); } LLVMTypeRef LLVMPointerType(LLVMTypeRef ElementType, unsigned AddressSpace) { return wrap(PointerType::get(unwrap(ElementType), AddressSpace)); } LLVMTypeRef LLVMVectorType(LLVMTypeRef ElementType, unsigned ElementCount) { return wrap(VectorType::get(unwrap(ElementType), ElementCount)); } LLVMTypeRef LLVMGetElementType(LLVMTypeRef WrappedTy) { auto *Ty = unwrap(WrappedTy); if (auto *PTy = dyn_cast(Ty)) return wrap(PTy->getElementType()); return wrap(cast(Ty)->getElementType()); } unsigned LLVMGetNumContainedTypes(LLVMTypeRef Tp) { return unwrap(Tp)->getNumContainedTypes(); } unsigned LLVMGetArrayLength(LLVMTypeRef ArrayTy) { return unwrap(ArrayTy)->getNumElements(); } unsigned LLVMGetPointerAddressSpace(LLVMTypeRef PointerTy) { return unwrap(PointerTy)->getAddressSpace(); } unsigned LLVMGetVectorSize(LLVMTypeRef VectorTy) { return unwrap(VectorTy)->getNumElements(); } /*--.. Operations on other types ...........................................--*/ LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C) { return wrap(Type::getVoidTy(*unwrap(C))); } LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C) { return wrap(Type::getLabelTy(*unwrap(C))); } LLVMTypeRef LLVMTokenTypeInContext(LLVMContextRef C) { return wrap(Type::getTokenTy(*unwrap(C))); } LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C) { return wrap(Type::getMetadataTy(*unwrap(C))); } LLVMTypeRef LLVMVoidType(void) { return LLVMVoidTypeInContext(LLVMGetGlobalContext()); } LLVMTypeRef LLVMLabelType(void) { return LLVMLabelTypeInContext(LLVMGetGlobalContext()); } /*===-- Operations on values ----------------------------------------------===*/ /*--.. Operations on all values ............................................--*/ LLVMTypeRef LLVMTypeOf(LLVMValueRef Val) { return wrap(unwrap(Val)->getType()); } LLVMValueKind LLVMGetValueKind(LLVMValueRef Val) { switch(unwrap(Val)->getValueID()) { #define HANDLE_VALUE(Name) \ case Value::Name##Val: \ return LLVM##Name##ValueKind; #include "llvm/IR/Value.def" default: return LLVMInstructionValueKind; } } const char *LLVMGetValueName2(LLVMValueRef Val, size_t *Length) { auto *V = unwrap(Val); *Length = V->getName().size(); return V->getName().data(); } void LLVMSetValueName2(LLVMValueRef Val, const char *Name, size_t NameLen) { unwrap(Val)->setName(StringRef(Name, NameLen)); } const char *LLVMGetValueName(LLVMValueRef Val) { return unwrap(Val)->getName().data(); } void LLVMSetValueName(LLVMValueRef Val, const char *Name) { unwrap(Val)->setName(Name); } void LLVMDumpValue(LLVMValueRef Val) { unwrap(Val)->print(errs(), /*IsForDebug=*/true); } char* LLVMPrintValueToString(LLVMValueRef Val) { std::string buf; raw_string_ostream os(buf); if (unwrap(Val)) unwrap(Val)->print(os); else os << "Printing Value"; os.flush(); return strdup(buf.c_str()); } void LLVMReplaceAllUsesWith(LLVMValueRef OldVal, LLVMValueRef NewVal) { unwrap(OldVal)->replaceAllUsesWith(unwrap(NewVal)); } int LLVMHasMetadata(LLVMValueRef Inst) { return unwrap(Inst)->hasMetadata(); } LLVMValueRef LLVMGetMetadata(LLVMValueRef Inst, unsigned KindID) { auto *I = unwrap(Inst); assert(I && "Expected instruction"); if (auto *MD = I->getMetadata(KindID)) return wrap(MetadataAsValue::get(I->getContext(), MD)); return nullptr; } // MetadataAsValue uses a canonical format which strips the actual MDNode for // MDNode with just a single constant value, storing just a ConstantAsMetadata // This undoes this canonicalization, reconstructing the MDNode. static MDNode *extractMDNode(MetadataAsValue *MAV) { Metadata *MD = MAV->getMetadata(); assert((isa(MD) || isa(MD)) && "Expected a metadata node or a canonicalized constant"); if (MDNode *N = dyn_cast(MD)) return N; return MDNode::get(MAV->getContext(), MD); } void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef Val) { MDNode *N = Val ? extractMDNode(unwrap(Val)) : nullptr; unwrap(Inst)->setMetadata(KindID, N); } struct LLVMOpaqueValueMetadataEntry { unsigned Kind; LLVMMetadataRef Metadata; }; using MetadataEntries = SmallVectorImpl>; static LLVMValueMetadataEntry * llvm_getMetadata(size_t *NumEntries, llvm::function_ref AccessMD) { SmallVector, 8> MVEs; AccessMD(MVEs); LLVMOpaqueValueMetadataEntry *Result = static_cast( safe_malloc(MVEs.size() * sizeof(LLVMOpaqueValueMetadataEntry))); for (unsigned i = 0; i < MVEs.size(); ++i) { const auto &ModuleFlag = MVEs[i]; Result[i].Kind = ModuleFlag.first; Result[i].Metadata = wrap(ModuleFlag.second); } *NumEntries = MVEs.size(); return Result; } LLVMValueMetadataEntry * LLVMInstructionGetAllMetadataOtherThanDebugLoc(LLVMValueRef Value, size_t *NumEntries) { return llvm_getMetadata(NumEntries, [&Value](MetadataEntries &Entries) { unwrap(Value)->getAllMetadata(Entries); }); } /*--.. Conversion functions ................................................--*/ #define LLVM_DEFINE_VALUE_CAST(name) \ LLVMValueRef LLVMIsA##name(LLVMValueRef Val) { \ return wrap(static_cast(dyn_cast_or_null(unwrap(Val)))); \ } LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DEFINE_VALUE_CAST) LLVMValueRef LLVMIsAMDNode(LLVMValueRef Val) { if (auto *MD = dyn_cast_or_null(unwrap(Val))) if (isa(MD->getMetadata()) || isa(MD->getMetadata())) return Val; return nullptr; } LLVMValueRef LLVMIsAMDString(LLVMValueRef Val) { if (auto *MD = dyn_cast_or_null(unwrap(Val))) if (isa(MD->getMetadata())) return Val; return nullptr; } /*--.. Operations on Uses ..................................................--*/ LLVMUseRef LLVMGetFirstUse(LLVMValueRef Val) { Value *V = unwrap(Val); Value::use_iterator I = V->use_begin(); if (I == V->use_end()) return nullptr; return wrap(&*I); } LLVMUseRef LLVMGetNextUse(LLVMUseRef U) { Use *Next = unwrap(U)->getNext(); if (Next) return wrap(Next); return nullptr; } LLVMValueRef LLVMGetUser(LLVMUseRef U) { return wrap(unwrap(U)->getUser()); } LLVMValueRef LLVMGetUsedValue(LLVMUseRef U) { return wrap(unwrap(U)->get()); } /*--.. Operations on Users .................................................--*/ static LLVMValueRef getMDNodeOperandImpl(LLVMContext &Context, const MDNode *N, unsigned Index) { Metadata *Op = N->getOperand(Index); if (!Op) return nullptr; if (auto *C = dyn_cast(Op)) return wrap(C->getValue()); return wrap(MetadataAsValue::get(Context, Op)); } LLVMValueRef LLVMGetOperand(LLVMValueRef Val, unsigned Index) { Value *V = unwrap(Val); if (auto *MD = dyn_cast(V)) { if (auto *L = dyn_cast(MD->getMetadata())) { assert(Index == 0 && "Function-local metadata can only have one operand"); return wrap(L->getValue()); } return getMDNodeOperandImpl(V->getContext(), cast(MD->getMetadata()), Index); } return wrap(cast(V)->getOperand(Index)); } LLVMUseRef LLVMGetOperandUse(LLVMValueRef Val, unsigned Index) { Value *V = unwrap(Val); return wrap(&cast(V)->getOperandUse(Index)); } void LLVMSetOperand(LLVMValueRef Val, unsigned Index, LLVMValueRef Op) { unwrap(Val)->setOperand(Index, unwrap(Op)); } int LLVMGetNumOperands(LLVMValueRef Val) { Value *V = unwrap(Val); if (isa(V)) return LLVMGetMDNodeNumOperands(Val); return cast(V)->getNumOperands(); } /*--.. Operations on constants of any type .................................--*/ LLVMValueRef LLVMConstNull(LLVMTypeRef Ty) { return wrap(Constant::getNullValue(unwrap(Ty))); } LLVMValueRef LLVMConstAllOnes(LLVMTypeRef Ty) { return wrap(Constant::getAllOnesValue(unwrap(Ty))); } LLVMValueRef LLVMGetUndef(LLVMTypeRef Ty) { return wrap(UndefValue::get(unwrap(Ty))); } LLVMBool LLVMIsConstant(LLVMValueRef Ty) { return isa(unwrap(Ty)); } LLVMBool LLVMIsNull(LLVMValueRef Val) { if (Constant *C = dyn_cast(unwrap(Val))) return C->isNullValue(); return false; } LLVMBool LLVMIsUndef(LLVMValueRef Val) { return isa(unwrap(Val)); } LLVMValueRef LLVMConstPointerNull(LLVMTypeRef Ty) { return wrap(ConstantPointerNull::get(unwrap(Ty))); } /*--.. Operations on metadata nodes ........................................--*/ LLVMMetadataRef LLVMMDStringInContext2(LLVMContextRef C, const char *Str, size_t SLen) { return wrap(MDString::get(*unwrap(C), StringRef(Str, SLen))); } LLVMMetadataRef LLVMMDNodeInContext2(LLVMContextRef C, LLVMMetadataRef *MDs, size_t Count) { return wrap(MDNode::get(*unwrap(C), ArrayRef(unwrap(MDs), Count))); } LLVMValueRef LLVMMDStringInContext(LLVMContextRef C, const char *Str, unsigned SLen) { LLVMContext &Context = *unwrap(C); return wrap(MetadataAsValue::get( Context, MDString::get(Context, StringRef(Str, SLen)))); } LLVMValueRef LLVMMDString(const char *Str, unsigned SLen) { return LLVMMDStringInContext(LLVMGetGlobalContext(), Str, SLen); } LLVMValueRef LLVMMDNodeInContext(LLVMContextRef C, LLVMValueRef *Vals, unsigned Count) { LLVMContext &Context = *unwrap(C); SmallVector MDs; for (auto *OV : makeArrayRef(Vals, Count)) { Value *V = unwrap(OV); Metadata *MD; if (!V) MD = nullptr; else if (auto *C = dyn_cast(V)) MD = ConstantAsMetadata::get(C); else if (auto *MDV = dyn_cast(V)) { MD = MDV->getMetadata(); assert(!isa(MD) && "Unexpected function-local metadata " "outside of direct argument to call"); } else { // This is function-local metadata. Pretend to make an MDNode. assert(Count == 1 && "Expected only one operand to function-local metadata"); return wrap(MetadataAsValue::get(Context, LocalAsMetadata::get(V))); } MDs.push_back(MD); } return wrap(MetadataAsValue::get(Context, MDNode::get(Context, MDs))); } LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) { return LLVMMDNodeInContext(LLVMGetGlobalContext(), Vals, Count); } LLVMValueRef LLVMMetadataAsValue(LLVMContextRef C, LLVMMetadataRef MD) { return wrap(MetadataAsValue::get(*unwrap(C), unwrap(MD))); } LLVMMetadataRef LLVMValueAsMetadata(LLVMValueRef Val) { auto *V = unwrap(Val); if (auto *C = dyn_cast(V)) return wrap(ConstantAsMetadata::get(C)); if (auto *MAV = dyn_cast(V)) return wrap(MAV->getMetadata()); return wrap(ValueAsMetadata::get(V)); } const char *LLVMGetMDString(LLVMValueRef V, unsigned *Length) { if (const auto *MD = dyn_cast(unwrap(V))) if (const MDString *S = dyn_cast(MD->getMetadata())) { *Length = S->getString().size(); return S->getString().data(); } *Length = 0; return nullptr; } unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V) { auto *MD = cast(unwrap(V)); if (isa(MD->getMetadata())) return 1; return cast(MD->getMetadata())->getNumOperands(); } LLVMNamedMDNodeRef LLVMGetFirstNamedMetadata(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::named_metadata_iterator I = Mod->named_metadata_begin(); if (I == Mod->named_metadata_end()) return nullptr; return wrap(&*I); } LLVMNamedMDNodeRef LLVMGetLastNamedMetadata(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::named_metadata_iterator I = Mod->named_metadata_end(); if (I == Mod->named_metadata_begin()) return nullptr; return wrap(&*--I); } LLVMNamedMDNodeRef LLVMGetNextNamedMetadata(LLVMNamedMDNodeRef NMD) { NamedMDNode *NamedNode = unwrap(NMD); Module::named_metadata_iterator I(NamedNode); if (++I == NamedNode->getParent()->named_metadata_end()) return nullptr; return wrap(&*I); } LLVMNamedMDNodeRef LLVMGetPreviousNamedMetadata(LLVMNamedMDNodeRef NMD) { NamedMDNode *NamedNode = unwrap(NMD); Module::named_metadata_iterator I(NamedNode); if (I == NamedNode->getParent()->named_metadata_begin()) return nullptr; return wrap(&*--I); } LLVMNamedMDNodeRef LLVMGetNamedMetadata(LLVMModuleRef M, const char *Name, size_t NameLen) { return wrap(unwrap(M)->getNamedMetadata(StringRef(Name, NameLen))); } LLVMNamedMDNodeRef LLVMGetOrInsertNamedMetadata(LLVMModuleRef M, const char *Name, size_t NameLen) { return wrap(unwrap(M)->getOrInsertNamedMetadata({Name, NameLen})); } const char *LLVMGetNamedMetadataName(LLVMNamedMDNodeRef NMD, size_t *NameLen) { NamedMDNode *NamedNode = unwrap(NMD); *NameLen = NamedNode->getName().size(); return NamedNode->getName().data(); } void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest) { auto *MD = cast(unwrap(V)); if (auto *MDV = dyn_cast(MD->getMetadata())) { *Dest = wrap(MDV->getValue()); return; } const auto *N = cast(MD->getMetadata()); const unsigned numOperands = N->getNumOperands(); LLVMContext &Context = unwrap(V)->getContext(); for (unsigned i = 0; i < numOperands; i++) Dest[i] = getMDNodeOperandImpl(Context, N, i); } unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char *Name) { if (NamedMDNode *N = unwrap(M)->getNamedMetadata(Name)) { return N->getNumOperands(); } return 0; } void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char *Name, LLVMValueRef *Dest) { NamedMDNode *N = unwrap(M)->getNamedMetadata(Name); if (!N) return; LLVMContext &Context = unwrap(M)->getContext(); for (unsigned i=0;igetNumOperands();i++) Dest[i] = wrap(MetadataAsValue::get(Context, N->getOperand(i))); } void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name, LLVMValueRef Val) { NamedMDNode *N = unwrap(M)->getOrInsertNamedMetadata(Name); if (!N) return; if (!Val) return; N->addOperand(extractMDNode(unwrap(Val))); } const char *LLVMGetDebugLocDirectory(LLVMValueRef Val, unsigned *Length) { if (!Length) return nullptr; StringRef S; if (const auto *I = dyn_cast(unwrap(Val))) { if (const auto &DL = I->getDebugLoc()) { S = DL->getDirectory(); } } else if (const auto *GV = dyn_cast(unwrap(Val))) { SmallVector GVEs; GV->getDebugInfo(GVEs); if (GVEs.size()) if (const DIGlobalVariable *DGV = GVEs[0]->getVariable()) S = DGV->getDirectory(); } else if (const auto *F = dyn_cast(unwrap(Val))) { if (const DISubprogram *DSP = F->getSubprogram()) S = DSP->getDirectory(); } else { assert(0 && "Expected Instruction, GlobalVariable or Function"); return nullptr; } *Length = S.size(); return S.data(); } const char *LLVMGetDebugLocFilename(LLVMValueRef Val, unsigned *Length) { if (!Length) return nullptr; StringRef S; if (const auto *I = dyn_cast(unwrap(Val))) { if (const auto &DL = I->getDebugLoc()) { S = DL->getFilename(); } } else if (const auto *GV = dyn_cast(unwrap(Val))) { SmallVector GVEs; GV->getDebugInfo(GVEs); if (GVEs.size()) if (const DIGlobalVariable *DGV = GVEs[0]->getVariable()) S = DGV->getFilename(); } else if (const auto *F = dyn_cast(unwrap(Val))) { if (const DISubprogram *DSP = F->getSubprogram()) S = DSP->getFilename(); } else { assert(0 && "Expected Instruction, GlobalVariable or Function"); return nullptr; } *Length = S.size(); return S.data(); } unsigned LLVMGetDebugLocLine(LLVMValueRef Val) { unsigned L = 0; if (const auto *I = dyn_cast(unwrap(Val))) { if (const auto &DL = I->getDebugLoc()) { L = DL->getLine(); } } else if (const auto *GV = dyn_cast(unwrap(Val))) { SmallVector GVEs; GV->getDebugInfo(GVEs); if (GVEs.size()) if (const DIGlobalVariable *DGV = GVEs[0]->getVariable()) L = DGV->getLine(); } else if (const auto *F = dyn_cast(unwrap(Val))) { if (const DISubprogram *DSP = F->getSubprogram()) L = DSP->getLine(); } else { assert(0 && "Expected Instruction, GlobalVariable or Function"); return -1; } return L; } unsigned LLVMGetDebugLocColumn(LLVMValueRef Val) { unsigned C = 0; if (const auto *I = dyn_cast(unwrap(Val))) if (const auto &DL = I->getDebugLoc()) C = DL->getColumn(); return C; } /*--.. Operations on scalar constants ......................................--*/ LLVMValueRef LLVMConstInt(LLVMTypeRef IntTy, unsigned long long N, LLVMBool SignExtend) { return wrap(ConstantInt::get(unwrap(IntTy), N, SignExtend != 0)); } LLVMValueRef LLVMConstIntOfArbitraryPrecision(LLVMTypeRef IntTy, unsigned NumWords, const uint64_t Words[]) { IntegerType *Ty = unwrap(IntTy); return wrap(ConstantInt::get(Ty->getContext(), APInt(Ty->getBitWidth(), makeArrayRef(Words, NumWords)))); } LLVMValueRef LLVMConstIntOfString(LLVMTypeRef IntTy, const char Str[], uint8_t Radix) { return wrap(ConstantInt::get(unwrap(IntTy), StringRef(Str), Radix)); } LLVMValueRef LLVMConstIntOfStringAndSize(LLVMTypeRef IntTy, const char Str[], unsigned SLen, uint8_t Radix) { return wrap(ConstantInt::get(unwrap(IntTy), StringRef(Str, SLen), Radix)); } LLVMValueRef LLVMConstReal(LLVMTypeRef RealTy, double N) { return wrap(ConstantFP::get(unwrap(RealTy), N)); } LLVMValueRef LLVMConstRealOfString(LLVMTypeRef RealTy, const char *Text) { return wrap(ConstantFP::get(unwrap(RealTy), StringRef(Text))); } LLVMValueRef LLVMConstRealOfStringAndSize(LLVMTypeRef RealTy, const char Str[], unsigned SLen) { return wrap(ConstantFP::get(unwrap(RealTy), StringRef(Str, SLen))); } unsigned long long LLVMConstIntGetZExtValue(LLVMValueRef ConstantVal) { return unwrap(ConstantVal)->getZExtValue(); } long long LLVMConstIntGetSExtValue(LLVMValueRef ConstantVal) { return unwrap(ConstantVal)->getSExtValue(); } double LLVMConstRealGetDouble(LLVMValueRef ConstantVal, LLVMBool *LosesInfo) { ConstantFP *cFP = unwrap(ConstantVal) ; Type *Ty = cFP->getType(); if (Ty->isFloatTy()) { *LosesInfo = false; return cFP->getValueAPF().convertToFloat(); } if (Ty->isDoubleTy()) { *LosesInfo = false; return cFP->getValueAPF().convertToDouble(); } bool APFLosesInfo; APFloat APF = cFP->getValueAPF(); APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &APFLosesInfo); *LosesInfo = APFLosesInfo; return APF.convertToDouble(); } /*--.. Operations on composite constants ...................................--*/ LLVMValueRef LLVMConstStringInContext(LLVMContextRef C, const char *Str, unsigned Length, LLVMBool DontNullTerminate) { /* Inverted the sense of AddNull because ', 0)' is a better mnemonic for null termination than ', 1)'. */ return wrap(ConstantDataArray::getString(*unwrap(C), StringRef(Str, Length), DontNullTerminate == 0)); } LLVMValueRef LLVMConstString(const char *Str, unsigned Length, LLVMBool DontNullTerminate) { return LLVMConstStringInContext(LLVMGetGlobalContext(), Str, Length, DontNullTerminate); } LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef C, unsigned idx) { return wrap(unwrap(C)->getElementAsConstant(idx)); } LLVMBool LLVMIsConstantString(LLVMValueRef C) { return unwrap(C)->isString(); } const char *LLVMGetAsString(LLVMValueRef C, size_t *Length) { StringRef Str = unwrap(C)->getAsString(); *Length = Str.size(); return Str.data(); } LLVMValueRef LLVMConstArray(LLVMTypeRef ElementTy, LLVMValueRef *ConstantVals, unsigned Length) { ArrayRef V(unwrap(ConstantVals, Length), Length); return wrap(ConstantArray::get(ArrayType::get(unwrap(ElementTy), Length), V)); } LLVMValueRef LLVMConstStructInContext(LLVMContextRef C, LLVMValueRef *ConstantVals, unsigned Count, LLVMBool Packed) { Constant **Elements = unwrap(ConstantVals, Count); return wrap(ConstantStruct::getAnon(*unwrap(C), makeArrayRef(Elements, Count), Packed != 0)); } LLVMValueRef LLVMConstStruct(LLVMValueRef *ConstantVals, unsigned Count, LLVMBool Packed) { return LLVMConstStructInContext(LLVMGetGlobalContext(), ConstantVals, Count, Packed); } LLVMValueRef LLVMConstNamedStruct(LLVMTypeRef StructTy, LLVMValueRef *ConstantVals, unsigned Count) { Constant **Elements = unwrap(ConstantVals, Count); StructType *Ty = cast(unwrap(StructTy)); return wrap(ConstantStruct::get(Ty, makeArrayRef(Elements, Count))); } LLVMValueRef LLVMConstVector(LLVMValueRef *ScalarConstantVals, unsigned Size) { return wrap(ConstantVector::get(makeArrayRef( unwrap(ScalarConstantVals, Size), Size))); } /*-- Opcode mapping */ static LLVMOpcode map_to_llvmopcode(int opcode) { switch (opcode) { default: llvm_unreachable("Unhandled Opcode."); #define HANDLE_INST(num, opc, clas) case num: return LLVM##opc; #include "llvm/IR/Instruction.def" #undef HANDLE_INST } } static int map_from_llvmopcode(LLVMOpcode code) { switch (code) { #define HANDLE_INST(num, opc, clas) case LLVM##opc: return num; #include "llvm/IR/Instruction.def" #undef HANDLE_INST } llvm_unreachable("Unhandled Opcode."); } /*--.. Constant expressions ................................................--*/ LLVMOpcode LLVMGetConstOpcode(LLVMValueRef ConstantVal) { return map_to_llvmopcode(unwrap(ConstantVal)->getOpcode()); } LLVMValueRef LLVMAlignOf(LLVMTypeRef Ty) { return wrap(ConstantExpr::getAlignOf(unwrap(Ty))); } LLVMValueRef LLVMSizeOf(LLVMTypeRef Ty) { return wrap(ConstantExpr::getSizeOf(unwrap(Ty))); } LLVMValueRef LLVMConstNeg(LLVMValueRef ConstantVal) { return wrap(ConstantExpr::getNeg(unwrap(ConstantVal))); } LLVMValueRef LLVMConstNSWNeg(LLVMValueRef ConstantVal) { return wrap(ConstantExpr::getNSWNeg(unwrap(ConstantVal))); } LLVMValueRef LLVMConstNUWNeg(LLVMValueRef ConstantVal) { return wrap(ConstantExpr::getNUWNeg(unwrap(ConstantVal))); } LLVMValueRef LLVMConstFNeg(LLVMValueRef ConstantVal) { return wrap(ConstantExpr::getFNeg(unwrap(ConstantVal))); } LLVMValueRef LLVMConstNot(LLVMValueRef ConstantVal) { return wrap(ConstantExpr::getNot(unwrap(ConstantVal))); } LLVMValueRef LLVMConstAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getAdd(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstNSWAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getNSWAdd(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstNUWAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getNUWAdd(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstFAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getFAdd(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getSub(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstNSWSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getNSWSub(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstNUWSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getNUWSub(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstFSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getFSub(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getMul(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstNSWMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getNSWMul(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstNUWMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getNUWMul(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstFMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getFMul(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstUDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getUDiv(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstExactUDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getExactUDiv(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstSDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getSDiv(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstExactSDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getExactSDiv(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstFDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getFDiv(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstURem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getURem(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstSRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getSRem(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstFRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getFRem(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstAnd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getAnd(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstOr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getOr(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstXor(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getXor(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstICmp(LLVMIntPredicate Predicate, LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getICmp(Predicate, unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstFCmp(LLVMRealPredicate Predicate, LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getFCmp(Predicate, unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstShl(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getShl(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstLShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getLShr(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstAShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) { return wrap(ConstantExpr::getAShr(unwrap(LHSConstant), unwrap(RHSConstant))); } LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal, LLVMValueRef *ConstantIndices, unsigned NumIndices) { ArrayRef IdxList(unwrap(ConstantIndices, NumIndices), NumIndices); Constant *Val = unwrap(ConstantVal); Type *Ty = cast(Val->getType()->getScalarType())->getElementType(); return wrap(ConstantExpr::getGetElementPtr(Ty, Val, IdxList)); } LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal, LLVMValueRef *ConstantIndices, unsigned NumIndices) { ArrayRef IdxList(unwrap(ConstantIndices, NumIndices), NumIndices); Constant *Val = unwrap(ConstantVal); Type *Ty = cast(Val->getType()->getScalarType())->getElementType(); return wrap(ConstantExpr::getInBoundsGetElementPtr(Ty, Val, IdxList)); } LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getTrunc(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstSExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getSExt(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstZExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getZExt(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstFPTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getFPTrunc(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstFPExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getFPExtend(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstUIToFP(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getUIToFP(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstSIToFP(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getSIToFP(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstFPToUI(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getFPToUI(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstFPToSI(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getFPToSI(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstPtrToInt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getPtrToInt(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstIntToPtr(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getIntToPtr(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getBitCast(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstAddrSpaceCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getAddrSpaceCast(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstZExtOrBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getZExtOrBitCast(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstSExtOrBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getSExtOrBitCast(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstTruncOrBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getTruncOrBitCast(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstPointerCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getPointerCast(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstIntCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType, LLVMBool isSigned) { return wrap(ConstantExpr::getIntegerCast(unwrap(ConstantVal), unwrap(ToType), isSigned)); } LLVMValueRef LLVMConstFPCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getFPCast(unwrap(ConstantVal), unwrap(ToType))); } LLVMValueRef LLVMConstSelect(LLVMValueRef ConstantCondition, LLVMValueRef ConstantIfTrue, LLVMValueRef ConstantIfFalse) { return wrap(ConstantExpr::getSelect(unwrap(ConstantCondition), unwrap(ConstantIfTrue), unwrap(ConstantIfFalse))); } LLVMValueRef LLVMConstExtractElement(LLVMValueRef VectorConstant, LLVMValueRef IndexConstant) { return wrap(ConstantExpr::getExtractElement(unwrap(VectorConstant), unwrap(IndexConstant))); } LLVMValueRef LLVMConstInsertElement(LLVMValueRef VectorConstant, LLVMValueRef ElementValueConstant, LLVMValueRef IndexConstant) { return wrap(ConstantExpr::getInsertElement(unwrap(VectorConstant), unwrap(ElementValueConstant), unwrap(IndexConstant))); } LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant, LLVMValueRef VectorBConstant, LLVMValueRef MaskConstant) { return wrap(ConstantExpr::getShuffleVector(unwrap(VectorAConstant), unwrap(VectorBConstant), unwrap(MaskConstant))); } LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList, unsigned NumIdx) { return wrap(ConstantExpr::getExtractValue(unwrap(AggConstant), makeArrayRef(IdxList, NumIdx))); } LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant, LLVMValueRef ElementValueConstant, unsigned *IdxList, unsigned NumIdx) { return wrap(ConstantExpr::getInsertValue(unwrap(AggConstant), unwrap(ElementValueConstant), makeArrayRef(IdxList, NumIdx))); } LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty, const char *AsmString, const char *Constraints, LLVMBool HasSideEffects, LLVMBool IsAlignStack) { return wrap(InlineAsm::get(dyn_cast(unwrap(Ty)), AsmString, Constraints, HasSideEffects, IsAlignStack)); } LLVMValueRef LLVMBlockAddress(LLVMValueRef F, LLVMBasicBlockRef BB) { return wrap(BlockAddress::get(unwrap(F), unwrap(BB))); } /*--.. Operations on global variables, functions, and aliases (globals) ....--*/ LLVMModuleRef LLVMGetGlobalParent(LLVMValueRef Global) { return wrap(unwrap(Global)->getParent()); } LLVMBool LLVMIsDeclaration(LLVMValueRef Global) { return unwrap(Global)->isDeclaration(); } LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) { switch (unwrap(Global)->getLinkage()) { case GlobalValue::ExternalLinkage: return LLVMExternalLinkage; case GlobalValue::AvailableExternallyLinkage: return LLVMAvailableExternallyLinkage; case GlobalValue::LinkOnceAnyLinkage: return LLVMLinkOnceAnyLinkage; case GlobalValue::LinkOnceODRLinkage: return LLVMLinkOnceODRLinkage; case GlobalValue::WeakAnyLinkage: return LLVMWeakAnyLinkage; case GlobalValue::WeakODRLinkage: return LLVMWeakODRLinkage; case GlobalValue::AppendingLinkage: return LLVMAppendingLinkage; case GlobalValue::InternalLinkage: return LLVMInternalLinkage; case GlobalValue::PrivateLinkage: return LLVMPrivateLinkage; case GlobalValue::ExternalWeakLinkage: return LLVMExternalWeakLinkage; case GlobalValue::CommonLinkage: return LLVMCommonLinkage; } llvm_unreachable("Invalid GlobalValue linkage!"); } void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) { GlobalValue *GV = unwrap(Global); switch (Linkage) { case LLVMExternalLinkage: GV->setLinkage(GlobalValue::ExternalLinkage); break; case LLVMAvailableExternallyLinkage: GV->setLinkage(GlobalValue::AvailableExternallyLinkage); break; case LLVMLinkOnceAnyLinkage: GV->setLinkage(GlobalValue::LinkOnceAnyLinkage); break; case LLVMLinkOnceODRLinkage: GV->setLinkage(GlobalValue::LinkOnceODRLinkage); break; case LLVMLinkOnceODRAutoHideLinkage: LLVM_DEBUG( errs() << "LLVMSetLinkage(): LLVMLinkOnceODRAutoHideLinkage is no " "longer supported."); break; case LLVMWeakAnyLinkage: GV->setLinkage(GlobalValue::WeakAnyLinkage); break; case LLVMWeakODRLinkage: GV->setLinkage(GlobalValue::WeakODRLinkage); break; case LLVMAppendingLinkage: GV->setLinkage(GlobalValue::AppendingLinkage); break; case LLVMInternalLinkage: GV->setLinkage(GlobalValue::InternalLinkage); break; case LLVMPrivateLinkage: GV->setLinkage(GlobalValue::PrivateLinkage); break; case LLVMLinkerPrivateLinkage: GV->setLinkage(GlobalValue::PrivateLinkage); break; case LLVMLinkerPrivateWeakLinkage: GV->setLinkage(GlobalValue::PrivateLinkage); break; case LLVMDLLImportLinkage: LLVM_DEBUG( errs() << "LLVMSetLinkage(): LLVMDLLImportLinkage is no longer supported."); break; case LLVMDLLExportLinkage: LLVM_DEBUG( errs() << "LLVMSetLinkage(): LLVMDLLExportLinkage is no longer supported."); break; case LLVMExternalWeakLinkage: GV->setLinkage(GlobalValue::ExternalWeakLinkage); break; case LLVMGhostLinkage: LLVM_DEBUG( errs() << "LLVMSetLinkage(): LLVMGhostLinkage is no longer supported."); break; case LLVMCommonLinkage: GV->setLinkage(GlobalValue::CommonLinkage); break; } } const char *LLVMGetSection(LLVMValueRef Global) { // Using .data() is safe because of how GlobalObject::setSection is // implemented. return unwrap(Global)->getSection().data(); } void LLVMSetSection(LLVMValueRef Global, const char *Section) { unwrap(Global)->setSection(Section); } LLVMVisibility LLVMGetVisibility(LLVMValueRef Global) { return static_cast( unwrap(Global)->getVisibility()); } void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz) { unwrap(Global) ->setVisibility(static_cast(Viz)); } LLVMDLLStorageClass LLVMGetDLLStorageClass(LLVMValueRef Global) { return static_cast( unwrap(Global)->getDLLStorageClass()); } void LLVMSetDLLStorageClass(LLVMValueRef Global, LLVMDLLStorageClass Class) { unwrap(Global)->setDLLStorageClass( static_cast(Class)); } LLVMUnnamedAddr LLVMGetUnnamedAddress(LLVMValueRef Global) { switch (unwrap(Global)->getUnnamedAddr()) { case GlobalVariable::UnnamedAddr::None: return LLVMNoUnnamedAddr; case GlobalVariable::UnnamedAddr::Local: return LLVMLocalUnnamedAddr; case GlobalVariable::UnnamedAddr::Global: return LLVMGlobalUnnamedAddr; } llvm_unreachable("Unknown UnnamedAddr kind!"); } void LLVMSetUnnamedAddress(LLVMValueRef Global, LLVMUnnamedAddr UnnamedAddr) { GlobalValue *GV = unwrap(Global); switch (UnnamedAddr) { case LLVMNoUnnamedAddr: return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::None); case LLVMLocalUnnamedAddr: return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local); case LLVMGlobalUnnamedAddr: return GV->setUnnamedAddr(GlobalVariable::UnnamedAddr::Global); } } LLVMBool LLVMHasUnnamedAddr(LLVMValueRef Global) { return unwrap(Global)->hasGlobalUnnamedAddr(); } void LLVMSetUnnamedAddr(LLVMValueRef Global, LLVMBool HasUnnamedAddr) { unwrap(Global)->setUnnamedAddr( HasUnnamedAddr ? GlobalValue::UnnamedAddr::Global : GlobalValue::UnnamedAddr::None); } LLVMTypeRef LLVMGlobalGetValueType(LLVMValueRef Global) { return wrap(unwrap(Global)->getValueType()); } /*--.. Operations on global variables, load and store instructions .........--*/ unsigned LLVMGetAlignment(LLVMValueRef V) { Value *P = unwrap(V); if (GlobalValue *GV = dyn_cast(P)) return GV->getAlignment(); if (AllocaInst *AI = dyn_cast(P)) return AI->getAlignment(); if (LoadInst *LI = dyn_cast(P)) return LI->getAlignment(); if (StoreInst *SI = dyn_cast(P)) return SI->getAlignment(); llvm_unreachable( "only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment"); } void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) { Value *P = unwrap(V); if (GlobalObject *GV = dyn_cast(P)) GV->setAlignment(Bytes); else if (AllocaInst *AI = dyn_cast(P)) AI->setAlignment(Bytes); else if (LoadInst *LI = dyn_cast(P)) LI->setAlignment(Bytes); else if (StoreInst *SI = dyn_cast(P)) SI->setAlignment(Bytes); else llvm_unreachable( "only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment"); } LLVMValueMetadataEntry *LLVMGlobalCopyAllMetadata(LLVMValueRef Value, size_t *NumEntries) { return llvm_getMetadata(NumEntries, [&Value](MetadataEntries &Entries) { if (Instruction *Instr = dyn_cast(unwrap(Value))) { Instr->getAllMetadata(Entries); } else { unwrap(Value)->getAllMetadata(Entries); } }); } unsigned LLVMValueMetadataEntriesGetKind(LLVMValueMetadataEntry *Entries, unsigned Index) { LLVMOpaqueValueMetadataEntry MVE = static_cast(Entries[Index]); return MVE.Kind; } LLVMMetadataRef LLVMValueMetadataEntriesGetMetadata(LLVMValueMetadataEntry *Entries, unsigned Index) { LLVMOpaqueValueMetadataEntry MVE = static_cast(Entries[Index]); return MVE.Metadata; } void LLVMDisposeValueMetadataEntries(LLVMValueMetadataEntry *Entries) { free(Entries); } void LLVMGlobalSetMetadata(LLVMValueRef Global, unsigned Kind, LLVMMetadataRef MD) { unwrap(Global)->setMetadata(Kind, unwrap(MD)); } void LLVMGlobalEraseMetadata(LLVMValueRef Global, unsigned Kind) { unwrap(Global)->eraseMetadata(Kind); } void LLVMGlobalClearMetadata(LLVMValueRef Global) { unwrap(Global)->clearMetadata(); } /*--.. Operations on global variables ......................................--*/ LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) { return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false, GlobalValue::ExternalLinkage, nullptr, Name)); } LLVMValueRef LLVMAddGlobalInAddressSpace(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name, unsigned AddressSpace) { return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false, GlobalValue::ExternalLinkage, nullptr, Name, nullptr, GlobalVariable::NotThreadLocal, AddressSpace)); } LLVMValueRef LLVMGetNamedGlobal(LLVMModuleRef M, const char *Name) { return wrap(unwrap(M)->getNamedGlobal(Name)); } LLVMValueRef LLVMGetFirstGlobal(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::global_iterator I = Mod->global_begin(); if (I == Mod->global_end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::global_iterator I = Mod->global_end(); if (I == Mod->global_begin()) return nullptr; return wrap(&*--I); } LLVMValueRef LLVMGetNextGlobal(LLVMValueRef GlobalVar) { GlobalVariable *GV = unwrap(GlobalVar); Module::global_iterator I(GV); if (++I == GV->getParent()->global_end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetPreviousGlobal(LLVMValueRef GlobalVar) { GlobalVariable *GV = unwrap(GlobalVar); Module::global_iterator I(GV); if (I == GV->getParent()->global_begin()) return nullptr; return wrap(&*--I); } void LLVMDeleteGlobal(LLVMValueRef GlobalVar) { unwrap(GlobalVar)->eraseFromParent(); } LLVMValueRef LLVMGetInitializer(LLVMValueRef GlobalVar) { GlobalVariable* GV = unwrap(GlobalVar); if ( !GV->hasInitializer() ) return nullptr; return wrap(GV->getInitializer()); } void LLVMSetInitializer(LLVMValueRef GlobalVar, LLVMValueRef ConstantVal) { unwrap(GlobalVar) ->setInitializer(unwrap(ConstantVal)); } LLVMBool LLVMIsThreadLocal(LLVMValueRef GlobalVar) { return unwrap(GlobalVar)->isThreadLocal(); } void LLVMSetThreadLocal(LLVMValueRef GlobalVar, LLVMBool IsThreadLocal) { unwrap(GlobalVar)->setThreadLocal(IsThreadLocal != 0); } LLVMBool LLVMIsGlobalConstant(LLVMValueRef GlobalVar) { return unwrap(GlobalVar)->isConstant(); } void LLVMSetGlobalConstant(LLVMValueRef GlobalVar, LLVMBool IsConstant) { unwrap(GlobalVar)->setConstant(IsConstant != 0); } LLVMThreadLocalMode LLVMGetThreadLocalMode(LLVMValueRef GlobalVar) { switch (unwrap(GlobalVar)->getThreadLocalMode()) { case GlobalVariable::NotThreadLocal: return LLVMNotThreadLocal; case GlobalVariable::GeneralDynamicTLSModel: return LLVMGeneralDynamicTLSModel; case GlobalVariable::LocalDynamicTLSModel: return LLVMLocalDynamicTLSModel; case GlobalVariable::InitialExecTLSModel: return LLVMInitialExecTLSModel; case GlobalVariable::LocalExecTLSModel: return LLVMLocalExecTLSModel; } llvm_unreachable("Invalid GlobalVariable thread local mode"); } void LLVMSetThreadLocalMode(LLVMValueRef GlobalVar, LLVMThreadLocalMode Mode) { GlobalVariable *GV = unwrap(GlobalVar); switch (Mode) { case LLVMNotThreadLocal: GV->setThreadLocalMode(GlobalVariable::NotThreadLocal); break; case LLVMGeneralDynamicTLSModel: GV->setThreadLocalMode(GlobalVariable::GeneralDynamicTLSModel); break; case LLVMLocalDynamicTLSModel: GV->setThreadLocalMode(GlobalVariable::LocalDynamicTLSModel); break; case LLVMInitialExecTLSModel: GV->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); break; case LLVMLocalExecTLSModel: GV->setThreadLocalMode(GlobalVariable::LocalExecTLSModel); break; } } LLVMBool LLVMIsExternallyInitialized(LLVMValueRef GlobalVar) { return unwrap(GlobalVar)->isExternallyInitialized(); } void LLVMSetExternallyInitialized(LLVMValueRef GlobalVar, LLVMBool IsExtInit) { unwrap(GlobalVar)->setExternallyInitialized(IsExtInit); } /*--.. Operations on aliases ......................................--*/ LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee, const char *Name) { auto *PTy = cast(unwrap(Ty)); return wrap(GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), GlobalValue::ExternalLinkage, Name, unwrap(Aliasee), unwrap(M))); } LLVMValueRef LLVMGetNamedGlobalAlias(LLVMModuleRef M, const char *Name, size_t NameLen) { return wrap(unwrap(M)->getNamedAlias(Name)); } LLVMValueRef LLVMGetFirstGlobalAlias(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::alias_iterator I = Mod->alias_begin(); if (I == Mod->alias_end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetLastGlobalAlias(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::alias_iterator I = Mod->alias_end(); if (I == Mod->alias_begin()) return nullptr; return wrap(&*--I); } LLVMValueRef LLVMGetNextGlobalAlias(LLVMValueRef GA) { GlobalAlias *Alias = unwrap(GA); Module::alias_iterator I(Alias); if (++I == Alias->getParent()->alias_end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetPreviousGlobalAlias(LLVMValueRef GA) { GlobalAlias *Alias = unwrap(GA); Module::alias_iterator I(Alias); if (I == Alias->getParent()->alias_begin()) return nullptr; return wrap(&*--I); } LLVMValueRef LLVMAliasGetAliasee(LLVMValueRef Alias) { return wrap(unwrap(Alias)->getAliasee()); } void LLVMAliasSetAliasee(LLVMValueRef Alias, LLVMValueRef Aliasee) { unwrap(Alias)->setAliasee(unwrap(Aliasee)); } /*--.. Operations on functions .............................................--*/ LLVMValueRef LLVMAddFunction(LLVMModuleRef M, const char *Name, LLVMTypeRef FunctionTy) { return wrap(Function::Create(unwrap(FunctionTy), GlobalValue::ExternalLinkage, Name, unwrap(M))); } LLVMValueRef LLVMGetNamedFunction(LLVMModuleRef M, const char *Name) { return wrap(unwrap(M)->getFunction(Name)); } LLVMValueRef LLVMGetFirstFunction(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::iterator I = Mod->begin(); if (I == Mod->end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::iterator I = Mod->end(); if (I == Mod->begin()) return nullptr; return wrap(&*--I); } LLVMValueRef LLVMGetNextFunction(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Module::iterator I(Func); if (++I == Func->getParent()->end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetPreviousFunction(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Module::iterator I(Func); if (I == Func->getParent()->begin()) return nullptr; return wrap(&*--I); } void LLVMDeleteFunction(LLVMValueRef Fn) { unwrap(Fn)->eraseFromParent(); } LLVMBool LLVMHasPersonalityFn(LLVMValueRef Fn) { return unwrap(Fn)->hasPersonalityFn(); } LLVMValueRef LLVMGetPersonalityFn(LLVMValueRef Fn) { return wrap(unwrap(Fn)->getPersonalityFn()); } void LLVMSetPersonalityFn(LLVMValueRef Fn, LLVMValueRef PersonalityFn) { unwrap(Fn)->setPersonalityFn(unwrap(PersonalityFn)); } unsigned LLVMGetIntrinsicID(LLVMValueRef Fn) { if (Function *F = dyn_cast(unwrap(Fn))) return F->getIntrinsicID(); return 0; } static Intrinsic::ID llvm_map_to_intrinsic_id(unsigned ID) { assert(ID < llvm::Intrinsic::num_intrinsics && "Intrinsic ID out of range"); return llvm::Intrinsic::ID(ID); } LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod, unsigned ID, LLVMTypeRef *ParamTypes, size_t ParamCount) { ArrayRef Tys(unwrap(ParamTypes), ParamCount); auto IID = llvm_map_to_intrinsic_id(ID); return wrap(llvm::Intrinsic::getDeclaration(unwrap(Mod), IID, Tys)); } const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength) { auto IID = llvm_map_to_intrinsic_id(ID); auto Str = llvm::Intrinsic::getName(IID); *NameLength = Str.size(); return Str.data(); } LLVMTypeRef LLVMIntrinsicGetType(LLVMContextRef Ctx, unsigned ID, LLVMTypeRef *ParamTypes, size_t ParamCount) { auto IID = llvm_map_to_intrinsic_id(ID); ArrayRef Tys(unwrap(ParamTypes), ParamCount); return wrap(llvm::Intrinsic::getType(*unwrap(Ctx), IID, Tys)); } const char *LLVMIntrinsicCopyOverloadedName(unsigned ID, LLVMTypeRef *ParamTypes, size_t ParamCount, size_t *NameLength) { auto IID = llvm_map_to_intrinsic_id(ID); ArrayRef Tys(unwrap(ParamTypes), ParamCount); auto Str = llvm::Intrinsic::getName(IID, Tys); *NameLength = Str.length(); return strdup(Str.c_str()); } unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen) { return Function::lookupIntrinsicID({Name, NameLen}); } LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID) { auto IID = llvm_map_to_intrinsic_id(ID); return llvm::Intrinsic::isOverloaded(IID); } unsigned LLVMGetFunctionCallConv(LLVMValueRef Fn) { return unwrap(Fn)->getCallingConv(); } void LLVMSetFunctionCallConv(LLVMValueRef Fn, unsigned CC) { return unwrap(Fn)->setCallingConv( static_cast(CC)); } const char *LLVMGetGC(LLVMValueRef Fn) { Function *F = unwrap(Fn); return F->hasGC()? F->getGC().c_str() : nullptr; } void LLVMSetGC(LLVMValueRef Fn, const char *GC) { Function *F = unwrap(Fn); if (GC) F->setGC(GC); else F->clearGC(); } void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, LLVMAttributeRef A) { unwrap(F)->addAttribute(Idx, unwrap(A)); } unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) { auto AS = unwrap(F)->getAttributes().getAttributes(Idx); return AS.getNumAttributes(); } void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, LLVMAttributeRef *Attrs) { auto AS = unwrap(F)->getAttributes().getAttributes(Idx); for (auto A : AS) *Attrs++ = wrap(A); } LLVMAttributeRef LLVMGetEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unsigned KindID) { return wrap(unwrap(F)->getAttribute(Idx, (Attribute::AttrKind)KindID)); } LLVMAttributeRef LLVMGetStringAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, const char *K, unsigned KLen) { return wrap(unwrap(F)->getAttribute(Idx, StringRef(K, KLen))); } void LLVMRemoveEnumAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, unsigned KindID) { unwrap(F)->removeAttribute(Idx, (Attribute::AttrKind)KindID); } void LLVMRemoveStringAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx, const char *K, unsigned KLen) { unwrap(F)->removeAttribute(Idx, StringRef(K, KLen)); } void LLVMAddTargetDependentFunctionAttr(LLVMValueRef Fn, const char *A, const char *V) { Function *Func = unwrap(Fn); Attribute Attr = Attribute::get(Func->getContext(), A, V); Func->addAttribute(AttributeList::FunctionIndex, Attr); } /*--.. Operations on parameters ............................................--*/ unsigned LLVMCountParams(LLVMValueRef FnRef) { // This function is strictly redundant to // LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(FnRef))) return unwrap(FnRef)->arg_size(); } void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) { Function *Fn = unwrap(FnRef); for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; I++) *ParamRefs++ = wrap(&*I); } LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) { Function *Fn = unwrap(FnRef); return wrap(&Fn->arg_begin()[index]); } LLVMValueRef LLVMGetParamParent(LLVMValueRef V) { return wrap(unwrap(V)->getParent()); } LLVMValueRef LLVMGetFirstParam(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Function::arg_iterator I = Func->arg_begin(); if (I == Func->arg_end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Function::arg_iterator I = Func->arg_end(); if (I == Func->arg_begin()) return nullptr; return wrap(&*--I); } LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) { Argument *A = unwrap(Arg); Function *Fn = A->getParent(); if (A->getArgNo() + 1 >= Fn->arg_size()) return nullptr; return wrap(&Fn->arg_begin()[A->getArgNo() + 1]); } LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) { Argument *A = unwrap(Arg); if (A->getArgNo() == 0) return nullptr; return wrap(&A->getParent()->arg_begin()[A->getArgNo() - 1]); } void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) { Argument *A = unwrap(Arg); A->addAttr(Attribute::getWithAlignment(A->getContext(), align)); } /*--.. Operations on ifuncs ................................................--*/ LLVMValueRef LLVMAddGlobalIFunc(LLVMModuleRef M, const char *Name, size_t NameLen, LLVMTypeRef Ty, unsigned AddrSpace, LLVMValueRef Resolver) { return wrap(GlobalIFunc::create(unwrap(Ty), AddrSpace, GlobalValue::ExternalLinkage, StringRef(Name, NameLen), unwrap(Resolver), unwrap(M))); } LLVMValueRef LLVMGetNamedGlobalIFunc(LLVMModuleRef M, const char *Name, size_t NameLen) { return wrap(unwrap(M)->getNamedIFunc(StringRef(Name, NameLen))); } LLVMValueRef LLVMGetFirstGlobalIFunc(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::ifunc_iterator I = Mod->ifunc_begin(); if (I == Mod->ifunc_end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetLastGlobalIFunc(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::ifunc_iterator I = Mod->ifunc_end(); if (I == Mod->ifunc_begin()) return nullptr; return wrap(&*--I); } LLVMValueRef LLVMGetNextGlobalIFunc(LLVMValueRef IFunc) { GlobalIFunc *GIF = unwrap(IFunc); Module::ifunc_iterator I(GIF); if (++I == GIF->getParent()->ifunc_end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetPreviousGlobalIFunc(LLVMValueRef IFunc) { GlobalIFunc *GIF = unwrap(IFunc); Module::ifunc_iterator I(GIF); if (I == GIF->getParent()->ifunc_begin()) return nullptr; return wrap(&*--I); } LLVMValueRef LLVMGetGlobalIFuncResolver(LLVMValueRef IFunc) { return wrap(unwrap(IFunc)->getResolver()); } void LLVMSetGlobalIFuncResolver(LLVMValueRef IFunc, LLVMValueRef Resolver) { unwrap(IFunc)->setResolver(unwrap(Resolver)); } void LLVMEraseGlobalIFunc(LLVMValueRef IFunc) { unwrap(IFunc)->eraseFromParent(); } void LLVMRemoveGlobalIFunc(LLVMValueRef IFunc) { unwrap(IFunc)->removeFromParent(); } /*--.. Operations on basic blocks ..........................................--*/ LLVMValueRef LLVMBasicBlockAsValue(LLVMBasicBlockRef BB) { return wrap(static_cast(unwrap(BB))); } LLVMBool LLVMValueIsBasicBlock(LLVMValueRef Val) { return isa(unwrap(Val)); } LLVMBasicBlockRef LLVMValueAsBasicBlock(LLVMValueRef Val) { return wrap(unwrap(Val)); } const char *LLVMGetBasicBlockName(LLVMBasicBlockRef BB) { return unwrap(BB)->getName().data(); } LLVMValueRef LLVMGetBasicBlockParent(LLVMBasicBlockRef BB) { return wrap(unwrap(BB)->getParent()); } LLVMValueRef LLVMGetBasicBlockTerminator(LLVMBasicBlockRef BB) { return wrap(unwrap(BB)->getTerminator()); } unsigned LLVMCountBasicBlocks(LLVMValueRef FnRef) { return unwrap(FnRef)->size(); } void LLVMGetBasicBlocks(LLVMValueRef FnRef, LLVMBasicBlockRef *BasicBlocksRefs){ Function *Fn = unwrap(FnRef); for (BasicBlock &BB : *Fn) *BasicBlocksRefs++ = wrap(&BB); } LLVMBasicBlockRef LLVMGetEntryBasicBlock(LLVMValueRef Fn) { return wrap(&unwrap(Fn)->getEntryBlock()); } LLVMBasicBlockRef LLVMGetFirstBasicBlock(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Function::iterator I = Func->begin(); if (I == Func->end()) return nullptr; return wrap(&*I); } LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Function::iterator I = Func->end(); if (I == Func->begin()) return nullptr; return wrap(&*--I); } LLVMBasicBlockRef LLVMGetNextBasicBlock(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); Function::iterator I(Block); if (++I == Block->getParent()->end()) return nullptr; return wrap(&*I); } LLVMBasicBlockRef LLVMGetPreviousBasicBlock(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); Function::iterator I(Block); if (I == Block->getParent()->begin()) return nullptr; return wrap(&*--I); } LLVMBasicBlockRef LLVMCreateBasicBlockInContext(LLVMContextRef C, const char *Name) { return wrap(llvm::BasicBlock::Create(*unwrap(C), Name)); } void LLVMInsertExistingBasicBlockAfterInsertBlock(LLVMBuilderRef Builder, LLVMBasicBlockRef BB) { BasicBlock *ToInsert = unwrap(BB); BasicBlock *CurBB = unwrap(Builder)->GetInsertBlock(); assert(CurBB && "current insertion point is invalid!"); CurBB->getParent()->getBasicBlockList().insertAfter(CurBB->getIterator(), ToInsert); } void LLVMAppendExistingBasicBlock(LLVMValueRef Fn, LLVMBasicBlockRef BB) { unwrap(Fn)->getBasicBlockList().push_back(unwrap(BB)); } LLVMBasicBlockRef LLVMAppendBasicBlockInContext(LLVMContextRef C, LLVMValueRef FnRef, const char *Name) { return wrap(BasicBlock::Create(*unwrap(C), Name, unwrap(FnRef))); } LLVMBasicBlockRef LLVMAppendBasicBlock(LLVMValueRef FnRef, const char *Name) { return LLVMAppendBasicBlockInContext(LLVMGetGlobalContext(), FnRef, Name); } LLVMBasicBlockRef LLVMInsertBasicBlockInContext(LLVMContextRef C, LLVMBasicBlockRef BBRef, const char *Name) { BasicBlock *BB = unwrap(BBRef); return wrap(BasicBlock::Create(*unwrap(C), Name, BB->getParent(), BB)); } LLVMBasicBlockRef LLVMInsertBasicBlock(LLVMBasicBlockRef BBRef, const char *Name) { return LLVMInsertBasicBlockInContext(LLVMGetGlobalContext(), BBRef, Name); } void LLVMDeleteBasicBlock(LLVMBasicBlockRef BBRef) { unwrap(BBRef)->eraseFromParent(); } void LLVMRemoveBasicBlockFromParent(LLVMBasicBlockRef BBRef) { unwrap(BBRef)->removeFromParent(); } void LLVMMoveBasicBlockBefore(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) { unwrap(BB)->moveBefore(unwrap(MovePos)); } void LLVMMoveBasicBlockAfter(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) { unwrap(BB)->moveAfter(unwrap(MovePos)); } /*--.. Operations on instructions ..........................................--*/ LLVMBasicBlockRef LLVMGetInstructionParent(LLVMValueRef Inst) { return wrap(unwrap(Inst)->getParent()); } LLVMValueRef LLVMGetFirstInstruction(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); BasicBlock::iterator I = Block->begin(); if (I == Block->end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); BasicBlock::iterator I = Block->end(); if (I == Block->begin()) return nullptr; return wrap(&*--I); } LLVMValueRef LLVMGetNextInstruction(LLVMValueRef Inst) { Instruction *Instr = unwrap(Inst); BasicBlock::iterator I(Instr); if (++I == Instr->getParent()->end()) return nullptr; return wrap(&*I); } LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) { Instruction *Instr = unwrap(Inst); BasicBlock::iterator I(Instr); if (I == Instr->getParent()->begin()) return nullptr; return wrap(&*--I); } void LLVMInstructionRemoveFromParent(LLVMValueRef Inst) { unwrap(Inst)->removeFromParent(); } void LLVMInstructionEraseFromParent(LLVMValueRef Inst) { unwrap(Inst)->eraseFromParent(); } LLVMIntPredicate LLVMGetICmpPredicate(LLVMValueRef Inst) { if (ICmpInst *I = dyn_cast(unwrap(Inst))) return (LLVMIntPredicate)I->getPredicate(); if (ConstantExpr *CE = dyn_cast(unwrap(Inst))) if (CE->getOpcode() == Instruction::ICmp) return (LLVMIntPredicate)CE->getPredicate(); return (LLVMIntPredicate)0; } LLVMRealPredicate LLVMGetFCmpPredicate(LLVMValueRef Inst) { if (FCmpInst *I = dyn_cast(unwrap(Inst))) return (LLVMRealPredicate)I->getPredicate(); if (ConstantExpr *CE = dyn_cast(unwrap(Inst))) if (CE->getOpcode() == Instruction::FCmp) return (LLVMRealPredicate)CE->getPredicate(); return (LLVMRealPredicate)0; } LLVMOpcode LLVMGetInstructionOpcode(LLVMValueRef Inst) { if (Instruction *C = dyn_cast(unwrap(Inst))) return map_to_llvmopcode(C->getOpcode()); return (LLVMOpcode)0; } LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst) { if (Instruction *C = dyn_cast(unwrap(Inst))) return wrap(C->clone()); return nullptr; } LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst) { Instruction *I = dyn_cast(unwrap(Inst)); return (I && I->isTerminator()) ? wrap(I) : nullptr; } unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) { if (FuncletPadInst *FPI = dyn_cast(unwrap(Instr))) { return FPI->getNumArgOperands(); } return unwrap(Instr)->getNumArgOperands(); } /*--.. Call and invoke instructions ........................................--*/ unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr) { return unwrap(Instr)->getCallingConv(); } void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) { return unwrap(Instr)->setCallingConv( static_cast(CC)); } void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, unsigned align) { auto *Call = unwrap(Instr); Attribute AlignAttr = Attribute::getWithAlignment(Call->getContext(), align); Call->addAttribute(index, AlignAttr); } void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, LLVMAttributeRef A) { unwrap(C)->addAttribute(Idx, unwrap(A)); } unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C, LLVMAttributeIndex Idx) { auto *Call = unwrap(C); auto AS = Call->getAttributes().getAttributes(Idx); return AS.getNumAttributes(); } void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx, LLVMAttributeRef *Attrs) { auto *Call = unwrap(C); auto AS = Call->getAttributes().getAttributes(Idx); for (auto A : AS) *Attrs++ = wrap(A); } LLVMAttributeRef LLVMGetCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, unsigned KindID) { return wrap( unwrap(C)->getAttribute(Idx, (Attribute::AttrKind)KindID)); } LLVMAttributeRef LLVMGetCallSiteStringAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, const char *K, unsigned KLen) { return wrap(unwrap(C)->getAttribute(Idx, StringRef(K, KLen))); } void LLVMRemoveCallSiteEnumAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, unsigned KindID) { unwrap(C)->removeAttribute(Idx, (Attribute::AttrKind)KindID); } void LLVMRemoveCallSiteStringAttribute(LLVMValueRef C, LLVMAttributeIndex Idx, const char *K, unsigned KLen) { unwrap(C)->removeAttribute(Idx, StringRef(K, KLen)); } LLVMValueRef LLVMGetCalledValue(LLVMValueRef Instr) { return wrap(unwrap(Instr)->getCalledValue()); } LLVMTypeRef LLVMGetCalledFunctionType(LLVMValueRef Instr) { return wrap(unwrap(Instr)->getFunctionType()); } /*--.. Operations on call instructions (only) ..............................--*/ LLVMBool LLVMIsTailCall(LLVMValueRef Call) { return unwrap(Call)->isTailCall(); } void LLVMSetTailCall(LLVMValueRef Call, LLVMBool isTailCall) { unwrap(Call)->setTailCall(isTailCall); } /*--.. Operations on invoke instructions (only) ............................--*/ LLVMBasicBlockRef LLVMGetNormalDest(LLVMValueRef Invoke) { return wrap(unwrap(Invoke)->getNormalDest()); } LLVMBasicBlockRef LLVMGetUnwindDest(LLVMValueRef Invoke) { if (CleanupReturnInst *CRI = dyn_cast(unwrap(Invoke))) { return wrap(CRI->getUnwindDest()); } else if (CatchSwitchInst *CSI = dyn_cast(unwrap(Invoke))) { return wrap(CSI->getUnwindDest()); } return wrap(unwrap(Invoke)->getUnwindDest()); } void LLVMSetNormalDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) { unwrap(Invoke)->setNormalDest(unwrap(B)); } void LLVMSetUnwindDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) { if (CleanupReturnInst *CRI = dyn_cast(unwrap(Invoke))) { return CRI->setUnwindDest(unwrap(B)); } else if (CatchSwitchInst *CSI = dyn_cast(unwrap(Invoke))) { return CSI->setUnwindDest(unwrap(B)); } unwrap(Invoke)->setUnwindDest(unwrap(B)); } /*--.. Operations on terminators ...........................................--*/ unsigned LLVMGetNumSuccessors(LLVMValueRef Term) { return unwrap(Term)->getNumSuccessors(); } LLVMBasicBlockRef LLVMGetSuccessor(LLVMValueRef Term, unsigned i) { return wrap(unwrap(Term)->getSuccessor(i)); } void LLVMSetSuccessor(LLVMValueRef Term, unsigned i, LLVMBasicBlockRef block) { return unwrap(Term)->setSuccessor(i, unwrap(block)); } /*--.. Operations on branch instructions (only) ............................--*/ LLVMBool LLVMIsConditional(LLVMValueRef Branch) { return unwrap(Branch)->isConditional(); } LLVMValueRef LLVMGetCondition(LLVMValueRef Branch) { return wrap(unwrap(Branch)->getCondition()); } void LLVMSetCondition(LLVMValueRef Branch, LLVMValueRef Cond) { return unwrap(Branch)->setCondition(unwrap(Cond)); } /*--.. Operations on switch instructions (only) ............................--*/ LLVMBasicBlockRef LLVMGetSwitchDefaultDest(LLVMValueRef Switch) { return wrap(unwrap(Switch)->getDefaultDest()); } /*--.. Operations on alloca instructions (only) ............................--*/ LLVMTypeRef LLVMGetAllocatedType(LLVMValueRef Alloca) { return wrap(unwrap(Alloca)->getAllocatedType()); } /*--.. Operations on gep instructions (only) ...............................--*/ LLVMBool LLVMIsInBounds(LLVMValueRef GEP) { return unwrap(GEP)->isInBounds(); } void LLVMSetIsInBounds(LLVMValueRef GEP, LLVMBool InBounds) { return unwrap(GEP)->setIsInBounds(InBounds); } /*--.. Operations on phi nodes .............................................--*/ void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues, LLVMBasicBlockRef *IncomingBlocks, unsigned Count) { PHINode *PhiVal = unwrap(PhiNode); for (unsigned I = 0; I != Count; ++I) PhiVal->addIncoming(unwrap(IncomingValues[I]), unwrap(IncomingBlocks[I])); } unsigned LLVMCountIncoming(LLVMValueRef PhiNode) { return unwrap(PhiNode)->getNumIncomingValues(); } LLVMValueRef LLVMGetIncomingValue(LLVMValueRef PhiNode, unsigned Index) { return wrap(unwrap(PhiNode)->getIncomingValue(Index)); } LLVMBasicBlockRef LLVMGetIncomingBlock(LLVMValueRef PhiNode, unsigned Index) { return wrap(unwrap(PhiNode)->getIncomingBlock(Index)); } /*--.. Operations on extractvalue and insertvalue nodes ....................--*/ unsigned LLVMGetNumIndices(LLVMValueRef Inst) { auto *I = unwrap(Inst); if (auto *GEP = dyn_cast(I)) return GEP->getNumIndices(); if (auto *EV = dyn_cast(I)) return EV->getNumIndices(); if (auto *IV = dyn_cast(I)) return IV->getNumIndices(); if (auto *CE = dyn_cast(I)) return CE->getIndices().size(); llvm_unreachable( "LLVMGetNumIndices applies only to extractvalue and insertvalue!"); } const unsigned *LLVMGetIndices(LLVMValueRef Inst) { auto *I = unwrap(Inst); if (auto *EV = dyn_cast(I)) return EV->getIndices().data(); if (auto *IV = dyn_cast(I)) return IV->getIndices().data(); if (auto *CE = dyn_cast(I)) return CE->getIndices().data(); llvm_unreachable( "LLVMGetIndices applies only to extractvalue and insertvalue!"); } /*===-- Instruction builders ----------------------------------------------===*/ LLVMBuilderRef LLVMCreateBuilderInContext(LLVMContextRef C) { return wrap(new IRBuilder<>(*unwrap(C))); } LLVMBuilderRef LLVMCreateBuilder(void) { return LLVMCreateBuilderInContext(LLVMGetGlobalContext()); } void LLVMPositionBuilder(LLVMBuilderRef Builder, LLVMBasicBlockRef Block, LLVMValueRef Instr) { BasicBlock *BB = unwrap(Block); auto I = Instr ? unwrap(Instr)->getIterator() : BB->end(); unwrap(Builder)->SetInsertPoint(BB, I); } void LLVMPositionBuilderBefore(LLVMBuilderRef Builder, LLVMValueRef Instr) { Instruction *I = unwrap(Instr); unwrap(Builder)->SetInsertPoint(I->getParent(), I->getIterator()); } void LLVMPositionBuilderAtEnd(LLVMBuilderRef Builder, LLVMBasicBlockRef Block) { BasicBlock *BB = unwrap(Block); unwrap(Builder)->SetInsertPoint(BB); } LLVMBasicBlockRef LLVMGetInsertBlock(LLVMBuilderRef Builder) { return wrap(unwrap(Builder)->GetInsertBlock()); } void LLVMClearInsertionPosition(LLVMBuilderRef Builder) { unwrap(Builder)->ClearInsertionPoint(); } void LLVMInsertIntoBuilder(LLVMBuilderRef Builder, LLVMValueRef Instr) { unwrap(Builder)->Insert(unwrap(Instr)); } void LLVMInsertIntoBuilderWithName(LLVMBuilderRef Builder, LLVMValueRef Instr, const char *Name) { unwrap(Builder)->Insert(unwrap(Instr), Name); } void LLVMDisposeBuilder(LLVMBuilderRef Builder) { delete unwrap(Builder); } /*--.. Metadata builders ...................................................--*/ LLVMMetadataRef LLVMGetCurrentDebugLocation2(LLVMBuilderRef Builder) { return wrap(unwrap(Builder)->getCurrentDebugLocation().getAsMDNode()); } void LLVMSetCurrentDebugLocation2(LLVMBuilderRef Builder, LLVMMetadataRef Loc) { if (Loc) unwrap(Builder)->SetCurrentDebugLocation(DebugLoc(unwrap(Loc))); else unwrap(Builder)->SetCurrentDebugLocation(DebugLoc()); } void LLVMSetCurrentDebugLocation(LLVMBuilderRef Builder, LLVMValueRef L) { MDNode *Loc = L ? cast(unwrap(L)->getMetadata()) : nullptr; unwrap(Builder)->SetCurrentDebugLocation(DebugLoc(Loc)); } LLVMValueRef LLVMGetCurrentDebugLocation(LLVMBuilderRef Builder) { LLVMContext &Context = unwrap(Builder)->getContext(); return wrap(MetadataAsValue::get( Context, unwrap(Builder)->getCurrentDebugLocation().getAsMDNode())); } void LLVMSetInstDebugLocation(LLVMBuilderRef Builder, LLVMValueRef Inst) { unwrap(Builder)->SetInstDebugLocation(unwrap(Inst)); } void LLVMBuilderSetDefaultFPMathTag(LLVMBuilderRef Builder, LLVMMetadataRef FPMathTag) { unwrap(Builder)->setDefaultFPMathTag(FPMathTag ? unwrap(FPMathTag) : nullptr); } LLVMMetadataRef LLVMBuilderGetDefaultFPMathTag(LLVMBuilderRef Builder) { return wrap(unwrap(Builder)->getDefaultFPMathTag()); } /*--.. Instruction builders ................................................--*/ LLVMValueRef LLVMBuildRetVoid(LLVMBuilderRef B) { return wrap(unwrap(B)->CreateRetVoid()); } LLVMValueRef LLVMBuildRet(LLVMBuilderRef B, LLVMValueRef V) { return wrap(unwrap(B)->CreateRet(unwrap(V))); } LLVMValueRef LLVMBuildAggregateRet(LLVMBuilderRef B, LLVMValueRef *RetVals, unsigned N) { return wrap(unwrap(B)->CreateAggregateRet(unwrap(RetVals), N)); } LLVMValueRef LLVMBuildBr(LLVMBuilderRef B, LLVMBasicBlockRef Dest) { return wrap(unwrap(B)->CreateBr(unwrap(Dest))); } LLVMValueRef LLVMBuildCondBr(LLVMBuilderRef B, LLVMValueRef If, LLVMBasicBlockRef Then, LLVMBasicBlockRef Else) { return wrap(unwrap(B)->CreateCondBr(unwrap(If), unwrap(Then), unwrap(Else))); } LLVMValueRef LLVMBuildSwitch(LLVMBuilderRef B, LLVMValueRef V, LLVMBasicBlockRef Else, unsigned NumCases) { return wrap(unwrap(B)->CreateSwitch(unwrap(V), unwrap(Else), NumCases)); } LLVMValueRef LLVMBuildIndirectBr(LLVMBuilderRef B, LLVMValueRef Addr, unsigned NumDests) { return wrap(unwrap(B)->CreateIndirectBr(unwrap(Addr), NumDests)); } LLVMValueRef LLVMBuildInvoke(LLVMBuilderRef B, LLVMValueRef Fn, LLVMValueRef *Args, unsigned NumArgs, LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch, const char *Name) { Value *V = unwrap(Fn); FunctionType *FnT = cast(cast(V->getType())->getElementType()); return wrap( unwrap(B)->CreateInvoke(FnT, unwrap(Fn), unwrap(Then), unwrap(Catch), makeArrayRef(unwrap(Args), NumArgs), Name)); } LLVMValueRef LLVMBuildInvoke2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn, LLVMValueRef *Args, unsigned NumArgs, LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch, const char *Name) { return wrap(unwrap(B)->CreateInvoke( unwrap(Ty), unwrap(Fn), unwrap(Then), unwrap(Catch), makeArrayRef(unwrap(Args), NumArgs), Name)); } LLVMValueRef LLVMBuildLandingPad(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef PersFn, unsigned NumClauses, const char *Name) { // The personality used to live on the landingpad instruction, but now it // lives on the parent function. For compatibility, take the provided // personality and put it on the parent function. if (PersFn) unwrap(B)->GetInsertBlock()->getParent()->setPersonalityFn( cast(unwrap(PersFn))); return wrap(unwrap(B)->CreateLandingPad(unwrap(Ty), NumClauses, Name)); } LLVMValueRef LLVMBuildCatchPad(LLVMBuilderRef B, LLVMValueRef ParentPad, LLVMValueRef *Args, unsigned NumArgs, const char *Name) { return wrap(unwrap(B)->CreateCatchPad(unwrap(ParentPad), makeArrayRef(unwrap(Args), NumArgs), Name)); } LLVMValueRef LLVMBuildCleanupPad(LLVMBuilderRef B, LLVMValueRef ParentPad, LLVMValueRef *Args, unsigned NumArgs, const char *Name) { if (ParentPad == nullptr) { Type *Ty = Type::getTokenTy(unwrap(B)->getContext()); ParentPad = wrap(Constant::getNullValue(Ty)); } return wrap(unwrap(B)->CreateCleanupPad(unwrap(ParentPad), makeArrayRef(unwrap(Args), NumArgs), Name)); } LLVMValueRef LLVMBuildResume(LLVMBuilderRef B, LLVMValueRef Exn) { return wrap(unwrap(B)->CreateResume(unwrap(Exn))); } LLVMValueRef LLVMBuildCatchSwitch(LLVMBuilderRef B, LLVMValueRef ParentPad, LLVMBasicBlockRef UnwindBB, unsigned NumHandlers, const char *Name) { if (ParentPad == nullptr) { Type *Ty = Type::getTokenTy(unwrap(B)->getContext()); ParentPad = wrap(Constant::getNullValue(Ty)); } return wrap(unwrap(B)->CreateCatchSwitch(unwrap(ParentPad), unwrap(UnwindBB), NumHandlers, Name)); } LLVMValueRef LLVMBuildCatchRet(LLVMBuilderRef B, LLVMValueRef CatchPad, LLVMBasicBlockRef BB) { return wrap(unwrap(B)->CreateCatchRet(unwrap(CatchPad), unwrap(BB))); } LLVMValueRef LLVMBuildCleanupRet(LLVMBuilderRef B, LLVMValueRef CatchPad, LLVMBasicBlockRef BB) { return wrap(unwrap(B)->CreateCleanupRet(unwrap(CatchPad), unwrap(BB))); } LLVMValueRef LLVMBuildUnreachable(LLVMBuilderRef B) { return wrap(unwrap(B)->CreateUnreachable()); } void LLVMAddCase(LLVMValueRef Switch, LLVMValueRef OnVal, LLVMBasicBlockRef Dest) { unwrap(Switch)->addCase(unwrap(OnVal), unwrap(Dest)); } void LLVMAddDestination(LLVMValueRef IndirectBr, LLVMBasicBlockRef Dest) { unwrap(IndirectBr)->addDestination(unwrap(Dest)); } unsigned LLVMGetNumClauses(LLVMValueRef LandingPad) { return unwrap(LandingPad)->getNumClauses(); } LLVMValueRef LLVMGetClause(LLVMValueRef LandingPad, unsigned Idx) { return wrap(unwrap(LandingPad)->getClause(Idx)); } void LLVMAddClause(LLVMValueRef LandingPad, LLVMValueRef ClauseVal) { unwrap(LandingPad)-> addClause(cast(unwrap(ClauseVal))); } LLVMBool LLVMIsCleanup(LLVMValueRef LandingPad) { return unwrap(LandingPad)->isCleanup(); } void LLVMSetCleanup(LLVMValueRef LandingPad, LLVMBool Val) { unwrap(LandingPad)->setCleanup(Val); } void LLVMAddHandler(LLVMValueRef CatchSwitch, LLVMBasicBlockRef Dest) { unwrap(CatchSwitch)->addHandler(unwrap(Dest)); } unsigned LLVMGetNumHandlers(LLVMValueRef CatchSwitch) { return unwrap(CatchSwitch)->getNumHandlers(); } void LLVMGetHandlers(LLVMValueRef CatchSwitch, LLVMBasicBlockRef *Handlers) { CatchSwitchInst *CSI = unwrap(CatchSwitch); for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(), E = CSI->handler_end(); I != E; ++I) *Handlers++ = wrap(*I); } LLVMValueRef LLVMGetParentCatchSwitch(LLVMValueRef CatchPad) { return wrap(unwrap(CatchPad)->getCatchSwitch()); } void LLVMSetParentCatchSwitch(LLVMValueRef CatchPad, LLVMValueRef CatchSwitch) { unwrap(CatchPad) ->setCatchSwitch(unwrap(CatchSwitch)); } /*--.. Funclets ...........................................................--*/ LLVMValueRef LLVMGetArgOperand(LLVMValueRef Funclet, unsigned i) { return wrap(unwrap(Funclet)->getArgOperand(i)); } void LLVMSetArgOperand(LLVMValueRef Funclet, unsigned i, LLVMValueRef value) { unwrap(Funclet)->setArgOperand(i, unwrap(value)); } /*--.. Arithmetic ..........................................................--*/ LLVMValueRef LLVMBuildAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateAdd(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildNSWAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateNSWAdd(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildNUWAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateNUWAdd(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildFAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateFAdd(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateSub(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildNSWSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateNSWSub(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildNUWSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateNUWSub(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildFSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateFSub(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateMul(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildNSWMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateNSWMul(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildNUWMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateNUWMul(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildFMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateFMul(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildUDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateUDiv(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildExactUDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateExactUDiv(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildSDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateSDiv(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildExactSDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateExactSDiv(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildFDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateFDiv(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildURem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateURem(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildSRem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateSRem(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildFRem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateFRem(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildShl(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateShl(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildLShr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateLShr(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildAShr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateAShr(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildAnd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateAnd(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildOr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateOr(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildXor(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateXor(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildBinOp(LLVMBuilderRef B, LLVMOpcode Op, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateBinOp(Instruction::BinaryOps(map_from_llvmopcode(Op)), unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) { return wrap(unwrap(B)->CreateNeg(unwrap(V), Name)); } LLVMValueRef LLVMBuildNSWNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) { return wrap(unwrap(B)->CreateNSWNeg(unwrap(V), Name)); } LLVMValueRef LLVMBuildNUWNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) { return wrap(unwrap(B)->CreateNUWNeg(unwrap(V), Name)); } LLVMValueRef LLVMBuildFNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) { return wrap(unwrap(B)->CreateFNeg(unwrap(V), Name)); } LLVMValueRef LLVMBuildNot(LLVMBuilderRef B, LLVMValueRef V, const char *Name) { return wrap(unwrap(B)->CreateNot(unwrap(V), Name)); } /*--.. Memory ..............................................................--*/ LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name) { Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext()); Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty)); AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy); Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), ITy, unwrap(Ty), AllocSize, nullptr, nullptr, ""); return wrap(unwrap(B)->Insert(Malloc, Twine(Name))); } LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Val, const char *Name) { Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext()); Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty)); AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy); Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), ITy, unwrap(Ty), AllocSize, unwrap(Val), nullptr, ""); return wrap(unwrap(B)->Insert(Malloc, Twine(Name))); } LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr, LLVMValueRef Val, LLVMValueRef Len, unsigned Align) { return wrap(unwrap(B)->CreateMemSet(unwrap(Ptr), unwrap(Val), unwrap(Len), Align)); } LLVMValueRef LLVMBuildMemCpy(LLVMBuilderRef B, LLVMValueRef Dst, unsigned DstAlign, LLVMValueRef Src, unsigned SrcAlign, LLVMValueRef Size) { return wrap(unwrap(B)->CreateMemCpy(unwrap(Dst), DstAlign, unwrap(Src), SrcAlign, unwrap(Size))); } LLVMValueRef LLVMBuildMemMove(LLVMBuilderRef B, LLVMValueRef Dst, unsigned DstAlign, LLVMValueRef Src, unsigned SrcAlign, LLVMValueRef Size) { return wrap(unwrap(B)->CreateMemMove(unwrap(Dst), DstAlign, unwrap(Src), SrcAlign, unwrap(Size))); } LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name) { return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), nullptr, Name)); } LLVMValueRef LLVMBuildArrayAlloca(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Val, const char *Name) { return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), unwrap(Val), Name)); } LLVMValueRef LLVMBuildFree(LLVMBuilderRef B, LLVMValueRef PointerVal) { return wrap(unwrap(B)->Insert( CallInst::CreateFree(unwrap(PointerVal), unwrap(B)->GetInsertBlock()))); } LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal, const char *Name) { Value *V = unwrap(PointerVal); PointerType *Ty = cast(V->getType()); return wrap(unwrap(B)->CreateLoad(Ty->getElementType(), V, Name)); } LLVMValueRef LLVMBuildLoad2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef PointerVal, const char *Name) { return wrap(unwrap(B)->CreateLoad(unwrap(Ty), unwrap(PointerVal), Name)); } LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val, LLVMValueRef PointerVal) { return wrap(unwrap(B)->CreateStore(unwrap(Val), unwrap(PointerVal))); } static AtomicOrdering mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) { switch (Ordering) { case LLVMAtomicOrderingNotAtomic: return AtomicOrdering::NotAtomic; case LLVMAtomicOrderingUnordered: return AtomicOrdering::Unordered; case LLVMAtomicOrderingMonotonic: return AtomicOrdering::Monotonic; case LLVMAtomicOrderingAcquire: return AtomicOrdering::Acquire; case LLVMAtomicOrderingRelease: return AtomicOrdering::Release; case LLVMAtomicOrderingAcquireRelease: return AtomicOrdering::AcquireRelease; case LLVMAtomicOrderingSequentiallyConsistent: return AtomicOrdering::SequentiallyConsistent; } llvm_unreachable("Invalid LLVMAtomicOrdering value!"); } static LLVMAtomicOrdering mapToLLVMOrdering(AtomicOrdering Ordering) { switch (Ordering) { case AtomicOrdering::NotAtomic: return LLVMAtomicOrderingNotAtomic; case AtomicOrdering::Unordered: return LLVMAtomicOrderingUnordered; case AtomicOrdering::Monotonic: return LLVMAtomicOrderingMonotonic; case AtomicOrdering::Acquire: return LLVMAtomicOrderingAcquire; case AtomicOrdering::Release: return LLVMAtomicOrderingRelease; case AtomicOrdering::AcquireRelease: return LLVMAtomicOrderingAcquireRelease; case AtomicOrdering::SequentiallyConsistent: return LLVMAtomicOrderingSequentiallyConsistent; } llvm_unreachable("Invalid AtomicOrdering value!"); } // TODO: Should this and other atomic instructions support building with // "syncscope"? LLVMValueRef LLVMBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering, LLVMBool isSingleThread, const char *Name) { return wrap( unwrap(B)->CreateFence(mapFromLLVMOrdering(Ordering), isSingleThread ? SyncScope::SingleThread : SyncScope::System, Name)); } LLVMValueRef LLVMBuildGEP(LLVMBuilderRef B, LLVMValueRef Pointer, LLVMValueRef *Indices, unsigned NumIndices, const char *Name) { ArrayRef IdxList(unwrap(Indices), NumIndices); Value *Val = unwrap(Pointer); Type *Ty = cast(Val->getType()->getScalarType())->getElementType(); return wrap(unwrap(B)->CreateGEP(Ty, Val, IdxList, Name)); } LLVMValueRef LLVMBuildGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Pointer, LLVMValueRef *Indices, unsigned NumIndices, const char *Name) { ArrayRef IdxList(unwrap(Indices), NumIndices); return wrap(unwrap(B)->CreateGEP(unwrap(Ty), unwrap(Pointer), IdxList, Name)); } LLVMValueRef LLVMBuildInBoundsGEP(LLVMBuilderRef B, LLVMValueRef Pointer, LLVMValueRef *Indices, unsigned NumIndices, const char *Name) { ArrayRef IdxList(unwrap(Indices), NumIndices); Value *Val = unwrap(Pointer); Type *Ty = cast(Val->getType()->getScalarType())->getElementType(); return wrap(unwrap(B)->CreateInBoundsGEP(Ty, Val, IdxList, Name)); } LLVMValueRef LLVMBuildInBoundsGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Pointer, LLVMValueRef *Indices, unsigned NumIndices, const char *Name) { ArrayRef IdxList(unwrap(Indices), NumIndices); return wrap( unwrap(B)->CreateInBoundsGEP(unwrap(Ty), unwrap(Pointer), IdxList, Name)); } LLVMValueRef LLVMBuildStructGEP(LLVMBuilderRef B, LLVMValueRef Pointer, unsigned Idx, const char *Name) { Value *Val = unwrap(Pointer); Type *Ty = cast(Val->getType()->getScalarType())->getElementType(); return wrap(unwrap(B)->CreateStructGEP(Ty, Val, Idx, Name)); } LLVMValueRef LLVMBuildStructGEP2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Pointer, unsigned Idx, const char *Name) { return wrap( unwrap(B)->CreateStructGEP(unwrap(Ty), unwrap(Pointer), Idx, Name)); } LLVMValueRef LLVMBuildGlobalString(LLVMBuilderRef B, const char *Str, const char *Name) { return wrap(unwrap(B)->CreateGlobalString(Str, Name)); } LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str, const char *Name) { return wrap(unwrap(B)->CreateGlobalStringPtr(Str, Name)); } LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) { Value *P = unwrap(MemAccessInst); if (LoadInst *LI = dyn_cast(P)) return LI->isVolatile(); return cast(P)->isVolatile(); } void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) { Value *P = unwrap(MemAccessInst); if (LoadInst *LI = dyn_cast(P)) return LI->setVolatile(isVolatile); return cast(P)->setVolatile(isVolatile); } LLVMAtomicOrdering LLVMGetOrdering(LLVMValueRef MemAccessInst) { Value *P = unwrap(MemAccessInst); AtomicOrdering O; if (LoadInst *LI = dyn_cast(P)) O = LI->getOrdering(); else O = cast(P)->getOrdering(); return mapToLLVMOrdering(O); } void LLVMSetOrdering(LLVMValueRef MemAccessInst, LLVMAtomicOrdering Ordering) { Value *P = unwrap(MemAccessInst); AtomicOrdering O = mapFromLLVMOrdering(Ordering); if (LoadInst *LI = dyn_cast(P)) return LI->setOrdering(O); return cast(P)->setOrdering(O); } /*--.. Casts ...............................................................--*/ LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateTrunc(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildZExt(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateZExt(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildSExt(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateSExt(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildFPToUI(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateFPToUI(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildFPToSI(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateFPToSI(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildUIToFP(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateUIToFP(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildSIToFP(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateSIToFP(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildFPTrunc(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateFPTrunc(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildFPExt(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateFPExt(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildPtrToInt(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreatePtrToInt(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildIntToPtr(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateIntToPtr(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildBitCast(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateBitCast(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildAddrSpaceCast(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateAddrSpaceCast(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildZExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateZExtOrBitCast(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildSExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateSExtOrBitCast(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildTruncOrBitCast(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateTruncOrBitCast(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildCast(LLVMBuilderRef B, LLVMOpcode Op, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateCast(Instruction::CastOps(map_from_llvmopcode(Op)), unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildPointerCast(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreatePointerCast(unwrap(Val), unwrap(DestTy), Name)); } LLVMValueRef LLVMBuildIntCast2(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, LLVMBool IsSigned, const char *Name) { return wrap( unwrap(B)->CreateIntCast(unwrap(Val), unwrap(DestTy), IsSigned, Name)); } LLVMValueRef LLVMBuildIntCast(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateIntCast(unwrap(Val), unwrap(DestTy), /*isSigned*/true, Name)); } LLVMValueRef LLVMBuildFPCast(LLVMBuilderRef B, LLVMValueRef Val, LLVMTypeRef DestTy, const char *Name) { return wrap(unwrap(B)->CreateFPCast(unwrap(Val), unwrap(DestTy), Name)); } /*--.. Comparisons .........................................................--*/ LLVMValueRef LLVMBuildICmp(LLVMBuilderRef B, LLVMIntPredicate Op, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateICmp(static_cast(Op), unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildFCmp(LLVMBuilderRef B, LLVMRealPredicate Op, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreateFCmp(static_cast(Op), unwrap(LHS), unwrap(RHS), Name)); } /*--.. Miscellaneous instructions ..........................................--*/ LLVMValueRef LLVMBuildPhi(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name) { return wrap(unwrap(B)->CreatePHI(unwrap(Ty), 0, Name)); } LLVMValueRef LLVMBuildCall(LLVMBuilderRef B, LLVMValueRef Fn, LLVMValueRef *Args, unsigned NumArgs, const char *Name) { Value *V = unwrap(Fn); FunctionType *FnT = cast(cast(V->getType())->getElementType()); return wrap(unwrap(B)->CreateCall(FnT, unwrap(Fn), makeArrayRef(unwrap(Args), NumArgs), Name)); } LLVMValueRef LLVMBuildCall2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn, LLVMValueRef *Args, unsigned NumArgs, const char *Name) { FunctionType *FTy = unwrap(Ty); return wrap(unwrap(B)->CreateCall(FTy, unwrap(Fn), makeArrayRef(unwrap(Args), NumArgs), Name)); } LLVMValueRef LLVMBuildSelect(LLVMBuilderRef B, LLVMValueRef If, LLVMValueRef Then, LLVMValueRef Else, const char *Name) { return wrap(unwrap(B)->CreateSelect(unwrap(If), unwrap(Then), unwrap(Else), Name)); } LLVMValueRef LLVMBuildVAArg(LLVMBuilderRef B, LLVMValueRef List, LLVMTypeRef Ty, const char *Name) { return wrap(unwrap(B)->CreateVAArg(unwrap(List), unwrap(Ty), Name)); } LLVMValueRef LLVMBuildExtractElement(LLVMBuilderRef B, LLVMValueRef VecVal, LLVMValueRef Index, const char *Name) { return wrap(unwrap(B)->CreateExtractElement(unwrap(VecVal), unwrap(Index), Name)); } LLVMValueRef LLVMBuildInsertElement(LLVMBuilderRef B, LLVMValueRef VecVal, LLVMValueRef EltVal, LLVMValueRef Index, const char *Name) { return wrap(unwrap(B)->CreateInsertElement(unwrap(VecVal), unwrap(EltVal), unwrap(Index), Name)); } LLVMValueRef LLVMBuildShuffleVector(LLVMBuilderRef B, LLVMValueRef V1, LLVMValueRef V2, LLVMValueRef Mask, const char *Name) { return wrap(unwrap(B)->CreateShuffleVector(unwrap(V1), unwrap(V2), unwrap(Mask), Name)); } LLVMValueRef LLVMBuildExtractValue(LLVMBuilderRef B, LLVMValueRef AggVal, unsigned Index, const char *Name) { return wrap(unwrap(B)->CreateExtractValue(unwrap(AggVal), Index, Name)); } LLVMValueRef LLVMBuildInsertValue(LLVMBuilderRef B, LLVMValueRef AggVal, LLVMValueRef EltVal, unsigned Index, const char *Name) { return wrap(unwrap(B)->CreateInsertValue(unwrap(AggVal), unwrap(EltVal), Index, Name)); } LLVMValueRef LLVMBuildIsNull(LLVMBuilderRef B, LLVMValueRef Val, const char *Name) { return wrap(unwrap(B)->CreateIsNull(unwrap(Val), Name)); } LLVMValueRef LLVMBuildIsNotNull(LLVMBuilderRef B, LLVMValueRef Val, const char *Name) { return wrap(unwrap(B)->CreateIsNotNull(unwrap(Val), Name)); } LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *Name) { return wrap(unwrap(B)->CreatePtrDiff(unwrap(LHS), unwrap(RHS), Name)); } LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op, LLVMValueRef PTR, LLVMValueRef Val, LLVMAtomicOrdering ordering, LLVMBool singleThread) { AtomicRMWInst::BinOp intop; switch (op) { case LLVMAtomicRMWBinOpXchg: intop = AtomicRMWInst::Xchg; break; case LLVMAtomicRMWBinOpAdd: intop = AtomicRMWInst::Add; break; case LLVMAtomicRMWBinOpSub: intop = AtomicRMWInst::Sub; break; case LLVMAtomicRMWBinOpAnd: intop = AtomicRMWInst::And; break; case LLVMAtomicRMWBinOpNand: intop = AtomicRMWInst::Nand; break; case LLVMAtomicRMWBinOpOr: intop = AtomicRMWInst::Or; break; case LLVMAtomicRMWBinOpXor: intop = AtomicRMWInst::Xor; break; case LLVMAtomicRMWBinOpMax: intop = AtomicRMWInst::Max; break; case LLVMAtomicRMWBinOpMin: intop = AtomicRMWInst::Min; break; case LLVMAtomicRMWBinOpUMax: intop = AtomicRMWInst::UMax; break; case LLVMAtomicRMWBinOpUMin: intop = AtomicRMWInst::UMin; break; } return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val), mapFromLLVMOrdering(ordering), singleThread ? SyncScope::SingleThread : SyncScope::System)); } LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr, LLVMValueRef Cmp, LLVMValueRef New, LLVMAtomicOrdering SuccessOrdering, LLVMAtomicOrdering FailureOrdering, LLVMBool singleThread) { return wrap(unwrap(B)->CreateAtomicCmpXchg(unwrap(Ptr), unwrap(Cmp), unwrap(New), mapFromLLVMOrdering(SuccessOrdering), mapFromLLVMOrdering(FailureOrdering), singleThread ? SyncScope::SingleThread : SyncScope::System)); } LLVMBool LLVMIsAtomicSingleThread(LLVMValueRef AtomicInst) { Value *P = unwrap(AtomicInst); if (AtomicRMWInst *I = dyn_cast(P)) return I->getSyncScopeID() == SyncScope::SingleThread; return cast(P)->getSyncScopeID() == SyncScope::SingleThread; } void LLVMSetAtomicSingleThread(LLVMValueRef AtomicInst, LLVMBool NewValue) { Value *P = unwrap(AtomicInst); SyncScope::ID SSID = NewValue ? SyncScope::SingleThread : SyncScope::System; if (AtomicRMWInst *I = dyn_cast(P)) return I->setSyncScopeID(SSID); return cast(P)->setSyncScopeID(SSID); } LLVMAtomicOrdering LLVMGetCmpXchgSuccessOrdering(LLVMValueRef CmpXchgInst) { Value *P = unwrap(CmpXchgInst); return mapToLLVMOrdering(cast(P)->getSuccessOrdering()); } void LLVMSetCmpXchgSuccessOrdering(LLVMValueRef CmpXchgInst, LLVMAtomicOrdering Ordering) { Value *P = unwrap(CmpXchgInst); AtomicOrdering O = mapFromLLVMOrdering(Ordering); return cast(P)->setSuccessOrdering(O); } LLVMAtomicOrdering LLVMGetCmpXchgFailureOrdering(LLVMValueRef CmpXchgInst) { Value *P = unwrap(CmpXchgInst); return mapToLLVMOrdering(cast(P)->getFailureOrdering()); } void LLVMSetCmpXchgFailureOrdering(LLVMValueRef CmpXchgInst, LLVMAtomicOrdering Ordering) { Value *P = unwrap(CmpXchgInst); AtomicOrdering O = mapFromLLVMOrdering(Ordering); return cast(P)->setFailureOrdering(O); } /*===-- Module providers --------------------------------------------------===*/ LLVMModuleProviderRef LLVMCreateModuleProviderForExistingModule(LLVMModuleRef M) { return reinterpret_cast(M); } void LLVMDisposeModuleProvider(LLVMModuleProviderRef MP) { delete unwrap(MP); } /*===-- Memory buffers ----------------------------------------------------===*/ LLVMBool LLVMCreateMemoryBufferWithContentsOfFile( const char *Path, LLVMMemoryBufferRef *OutMemBuf, char **OutMessage) { ErrorOr> MBOrErr = MemoryBuffer::getFile(Path); if (std::error_code EC = MBOrErr.getError()) { *OutMessage = strdup(EC.message().c_str()); return 1; } *OutMemBuf = wrap(MBOrErr.get().release()); return 0; } LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf, char **OutMessage) { ErrorOr> MBOrErr = MemoryBuffer::getSTDIN(); if (std::error_code EC = MBOrErr.getError()) { *OutMessage = strdup(EC.message().c_str()); return 1; } *OutMemBuf = wrap(MBOrErr.get().release()); return 0; } LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRange( const char *InputData, size_t InputDataLength, const char *BufferName, LLVMBool RequiresNullTerminator) { return wrap(MemoryBuffer::getMemBuffer(StringRef(InputData, InputDataLength), StringRef(BufferName), RequiresNullTerminator).release()); } LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRangeCopy( const char *InputData, size_t InputDataLength, const char *BufferName) { return wrap( MemoryBuffer::getMemBufferCopy(StringRef(InputData, InputDataLength), StringRef(BufferName)).release()); } const char *LLVMGetBufferStart(LLVMMemoryBufferRef MemBuf) { return unwrap(MemBuf)->getBufferStart(); } size_t LLVMGetBufferSize(LLVMMemoryBufferRef MemBuf) { return unwrap(MemBuf)->getBufferSize(); } void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf) { delete unwrap(MemBuf); } /*===-- Pass Registry -----------------------------------------------------===*/ LLVMPassRegistryRef LLVMGetGlobalPassRegistry(void) { return wrap(PassRegistry::getPassRegistry()); } /*===-- Pass Manager ------------------------------------------------------===*/ LLVMPassManagerRef LLVMCreatePassManager() { return wrap(new legacy::PassManager()); } LLVMPassManagerRef LLVMCreateFunctionPassManagerForModule(LLVMModuleRef M) { return wrap(new legacy::FunctionPassManager(unwrap(M))); } LLVMPassManagerRef LLVMCreateFunctionPassManager(LLVMModuleProviderRef P) { return LLVMCreateFunctionPassManagerForModule( reinterpret_cast(P)); } LLVMBool LLVMRunPassManager(LLVMPassManagerRef PM, LLVMModuleRef M) { return unwrap(PM)->run(*unwrap(M)); } LLVMBool LLVMInitializeFunctionPassManager(LLVMPassManagerRef FPM) { return unwrap(FPM)->doInitialization(); } LLVMBool LLVMRunFunctionPassManager(LLVMPassManagerRef FPM, LLVMValueRef F) { return unwrap(FPM)->run(*unwrap(F)); } LLVMBool LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM) { return unwrap(FPM)->doFinalization(); } void LLVMDisposePassManager(LLVMPassManagerRef PM) { delete unwrap(PM); } /*===-- Threading ------------------------------------------------------===*/ LLVMBool LLVMStartMultithreaded() { return LLVMIsMultithreaded(); } void LLVMStopMultithreaded() { } LLVMBool LLVMIsMultithreaded() { return llvm_is_multithreaded(); } Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 351709) @@ -1,12083 +1,12091 @@ //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the AArch64TargetLowering class. // //===----------------------------------------------------------------------===// #include "AArch64ExpandImm.h" #include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include #include #include #include #include using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64-lower" STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); static cl::opt EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), cl::init(false)); // FIXME: The necessary dtprel relocations don't seem to be supported // well in the GNU bfd and gold linkers at the moment. Therefore, by // default, for now, fall back to GeneralDynamic code generation. cl::opt EnableAArch64ELFLocalDynamicTLSGeneration( "aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); static cl::opt EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true)); /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. setBooleanContents(ZeroOrOneBooleanContent); // When comparing vectors the result sets the different elements in the // vector to all-one or all-zero. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // Set up the register classes. addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); } if (Subtarget->hasNEON()) { addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); // Someone set us up the NEON. addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); addDRTypeForNEON(MVT::v4i16); addDRTypeForNEON(MVT::v2i32); addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); addDRTypeForNEON(MVT::v4f16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); addQRTypeForNEON(MVT::v16i8); addQRTypeForNEON(MVT::v8i16); addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); addQRTypeForNEON(MVT::v8f16); } // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); // Provide all sorts of operation actions setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::i32, Custom); setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_CC, MVT::f16, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Custom); setOperationAction(ISD::JumpTable, MVT::i64, Custom); setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); setOperationAction(ISD::FREM, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f80, Expand); setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // Custom lowering hooks are needed for XOR // to fold it into CSINC/CSINV. setOperationAction(ISD::XOR, MVT::i32, Custom); setOperationAction(ISD::XOR, MVT::i64, Custom); // Virtually no operation on f128 is legal, but LLVM can't expand them when // there's a valid register class, so we need custom operations in most cases. setOperationAction(ISD::FABS, MVT::f128, Expand); setOperationAction(ISD::FADD, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); setOperationAction(ISD::FCOS, MVT::f128, Expand); setOperationAction(ISD::FDIV, MVT::f128, Custom); setOperationAction(ISD::FMA, MVT::f128, Expand); setOperationAction(ISD::FMUL, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); setOperationAction(ISD::FRINT, MVT::f128, Expand); setOperationAction(ISD::FSIN, MVT::f128, Expand); setOperationAction(ISD::FSINCOS, MVT::f128, Expand); setOperationAction(ISD::FSQRT, MVT::f128, Expand); setOperationAction(ISD::FSUB, MVT::f128, Custom); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); setOperationAction(ISD::BR_CC, MVT::f128, Custom); setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved. setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); // Variable arguments. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VACOPY, MVT::Other, Custom); setOperationAction(ISD::VAEND, MVT::Other, Expand); // Variable-sized objects. setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); if (Subtarget->isTargetWindows()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); // BlockAddress setOperationAction(ISD::BlockAddress, MVT::i64, Custom); // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. setOperationAction(ISD::ADDC, MVT::i32, Custom); setOperationAction(ISD::ADDE, MVT::i32, Custom); setOperationAction(ISD::SUBC, MVT::i32, Custom); setOperationAction(ISD::SUBE, MVT::i32, Custom); setOperationAction(ISD::ADDC, MVT::i64, Custom); setOperationAction(ISD::ADDE, MVT::i64, Custom); setOperationAction(ISD::SUBC, MVT::i64, Custom); setOperationAction(ISD::SUBE, MVT::i64, Custom); // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); // Custom lower Add/Sub/Mul with overflow. setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::SADDO, MVT::i64, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); setOperationAction(ISD::UADDO, MVT::i64, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); setOperationAction(ISD::SSUBO, MVT::i64, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); setOperationAction(ISD::USUBO, MVT::i64, Custom); setOperationAction(ISD::SMULO, MVT::i32, Custom); setOperationAction(ISD::SMULO, MVT::i64, Custom); setOperationAction(ISD::UMULO, MVT::i32, Custom); setOperationAction(ISD::UMULO, MVT::i64, Custom); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); if (Subtarget->hasFullFP16()) setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); else setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::FREM, MVT::v4f16, Expand); setOperationAction(ISD::FREM, MVT::v8f16, Expand); setOperationAction(ISD::FPOW, MVT::f16, Promote); setOperationAction(ISD::FPOW, MVT::v4f16, Expand); setOperationAction(ISD::FPOW, MVT::v8f16, Expand); setOperationAction(ISD::FPOWI, MVT::f16, Promote); setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); setOperationAction(ISD::FCOS, MVT::f16, Promote); setOperationAction(ISD::FCOS, MVT::v4f16, Expand); setOperationAction(ISD::FCOS, MVT::v8f16, Expand); setOperationAction(ISD::FSIN, MVT::f16, Promote); setOperationAction(ISD::FSIN, MVT::v4f16, Expand); setOperationAction(ISD::FSIN, MVT::v8f16, Expand); setOperationAction(ISD::FSINCOS, MVT::f16, Promote); setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); setOperationAction(ISD::FEXP, MVT::f16, Promote); setOperationAction(ISD::FEXP, MVT::v4f16, Expand); setOperationAction(ISD::FEXP, MVT::v8f16, Expand); setOperationAction(ISD::FEXP2, MVT::f16, Promote); setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); setOperationAction(ISD::FLOG, MVT::f16, Promote); setOperationAction(ISD::FLOG, MVT::v4f16, Expand); setOperationAction(ISD::FLOG, MVT::v8f16, Expand); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); if (!Subtarget->hasFullFP16()) { setOperationAction(ISD::SELECT, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); setOperationAction(ISD::SETCC, MVT::f16, Promote); setOperationAction(ISD::BR_CC, MVT::f16, Promote); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); setOperationAction(ISD::FDIV, MVT::f16, Promote); setOperationAction(ISD::FMA, MVT::f16, Promote); setOperationAction(ISD::FNEG, MVT::f16, Promote); setOperationAction(ISD::FABS, MVT::f16, Promote); setOperationAction(ISD::FCEIL, MVT::f16, Promote); setOperationAction(ISD::FSQRT, MVT::f16, Promote); setOperationAction(ISD::FFLOOR, MVT::f16, Promote); setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); setOperationAction(ISD::FRINT, MVT::f16, Promote); setOperationAction(ISD::FROUND, MVT::f16, Promote); setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); // promote v4f16 to v4f32 when that is known to be safe. setOperationAction(ISD::FADD, MVT::v4f16, Promote); setOperationAction(ISD::FSUB, MVT::v4f16, Promote); setOperationAction(ISD::FMUL, MVT::v4f16, Promote); setOperationAction(ISD::FDIV, MVT::v4f16, Promote); setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); setOperationAction(ISD::FABS, MVT::v4f16, Expand); setOperationAction(ISD::FNEG, MVT::v4f16, Expand); setOperationAction(ISD::FROUND, MVT::v4f16, Expand); setOperationAction(ISD::FMA, MVT::v4f16, Expand); setOperationAction(ISD::SETCC, MVT::v4f16, Expand); setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); setOperationAction(ISD::SELECT, MVT::v4f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); setOperationAction(ISD::FRINT, MVT::v4f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); setOperationAction(ISD::FABS, MVT::v8f16, Expand); setOperationAction(ISD::FADD, MVT::v8f16, Expand); setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); setOperationAction(ISD::FDIV, MVT::v8f16, Expand); setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); setOperationAction(ISD::FMA, MVT::v8f16, Expand); setOperationAction(ISD::FMUL, MVT::v8f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); setOperationAction(ISD::FNEG, MVT::v8f16, Expand); setOperationAction(ISD::FROUND, MVT::v8f16, Expand); setOperationAction(ISD::FRINT, MVT::v8f16, Expand); setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); setOperationAction(ISD::FSUB, MVT::v8f16, Expand); setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); setOperationAction(ISD::SETCC, MVT::v8f16, Expand); setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); setOperationAction(ISD::SELECT, MVT::v8f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); } // AArch64 has implementations of a lot of rounding-like FP operations. for (MVT Ty : {MVT::f32, MVT::f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); setOperationAction(ISD::FMINNUM, Ty, Legal); setOperationAction(ISD::FMAXNUM, Ty, Legal); setOperationAction(ISD::FMINIMUM, Ty, Legal); setOperationAction(ISD::FMAXIMUM, Ty, Legal); setOperationAction(ISD::LROUND, Ty, Legal); setOperationAction(ISD::LLROUND, Ty, Legal); setOperationAction(ISD::LRINT, Ty, Legal); setOperationAction(ISD::LLRINT, Ty, Legal); } if (Subtarget->hasFullFP16()) { setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); setOperationAction(ISD::FFLOOR, MVT::f16, Legal); setOperationAction(ISD::FCEIL, MVT::f16, Legal); setOperationAction(ISD::FRINT, MVT::f16, Legal); setOperationAction(ISD::FTRUNC, MVT::f16, Legal); setOperationAction(ISD::FROUND, MVT::f16, Legal); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { // Issue __sincos_stret if available. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } else { setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { setOperationAction(ISD::ConstantFP, MVT::f32, Legal); setOperationAction(ISD::ConstantFP, MVT::f64, Legal); } // AArch64 does not have floating-point extending loads, i1 sign-extending // load, floating-point truncating stores, or v2i32->v2i16 truncating store. for (MVT VT : MVT::fp_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); } for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f128, MVT::f80, Expand); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); setIndexedLoadAction(im, MVT::i32, Legal); setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); setIndexedLoadAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); setIndexedStoreAction(im, MVT::f16, Legal); } // Trap. setOperationAction(ISD::TRAP, MVT::Other, Legal); if (Subtarget->isTargetWindows()) setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // We combine OR nodes for bitfield operations. setTargetDAGCombine(ISD::OR); // Try to create BICs for vector ANDs. setTargetDAGCombine(ISD::AND); // Vector add and sub nodes may conceal a high-half opportunity. // Also, try to fold ADD into CSINC/CSINV.. setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); if (Subtarget->supportsAddressTopByteIgnored()) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::GlobalAddress); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemset = Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32; MaxGluedStoresPerMemcpy = 4; MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemcpy = Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16; MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; MaxLoadsPerMemcmpOptSize = 4; MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8; setStackPointerRegisterToSaveRestore(AArch64::SP); setSchedulingPreference(Sched::Hybrid); EnableExtLdPromotion = true; // Set required alignment. setMinFunctionAlignment(2); // Set preferred alignments. setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); setPrefLoopAlignment(STI.getPrefLoopAlignment()); // Only change the limit for entries in a jump table if specified by // the sub target, but not at the command line. unsigned MaxJT = STI.getMaximumJumpTableSize(); if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) setMaximumJumpTableSize(MaxJT); setHasExtractBitsInsn(true); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: setOperationAction(ISD::FABS, MVT::v1f64, Expand); setOperationAction(ISD::FADD, MVT::v1f64, Expand); setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); setOperationAction(ISD::FCOS, MVT::v1f64, Expand); setOperationAction(ISD::FDIV, MVT::v1f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); setOperationAction(ISD::FMA, MVT::v1f64, Expand); setOperationAction(ISD::FMUL, MVT::v1f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); setOperationAction(ISD::FNEG, MVT::v1f64, Expand); setOperationAction(ISD::FPOW, MVT::v1f64, Expand); setOperationAction(ISD::FREM, MVT::v1f64, Expand); setOperationAction(ISD::FROUND, MVT::v1f64, Expand); setOperationAction(ISD::FRINT, MVT::v1f64, Expand); setOperationAction(ISD::FSIN, MVT::v1f64, Expand); setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); setOperationAction(ISD::FSUB, MVT::v1f64, Expand); setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); setOperationAction(ISD::SETCC, MVT::v1f64, Expand); setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); setOperationAction(ISD::SELECT, MVT::v1f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); setOperationAction(ISD::MUL, MVT::v1i64, Expand); // AArch64 doesn't have a direct vector ->f32 conversion instructions for // elements smaller than i32, so promote the input to i32 first. setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); // i8 vector elements also need promotion to i32 for v8i8 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); // Similarly, there is no direct i32 -> f64 vector conversion instruction. setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); // Or, direct i32 -> f16 vector conversion. Set it so custom, so the // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); } else { // when AArch64 doesn't have fullfp16 support, promote the input // to i32 first. setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); } setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); // Custom handling for some quad-vector types to detect MULL. setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); // Vector reductions for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); } for (MVT VT : { MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); } setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { setOperationAction(ISD::MULHS, VT, Legal); setOperationAction(ISD::MULHU, VT, Legal); } else { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); } setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } // AArch64 has implementations of a lot of rounding-like FP operations. for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); } if (Subtarget->hasFullFP16()) { for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); } } setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { assert(VT.isVector() && "VT should be a vector type"); if (VT.isFloatingPoint()) { MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); setOperationPromotedToType(ISD::STORE, VT, PromoteTo); } // Mark vector float intrinsics as expand. if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); // But we do support custom-lowering for FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT, Custom); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); for (MVT InnerVT : MVT::all_valuetypes()) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // CNT supports only B element sizes, then use UADDLP to widen. if (VT != MVT::v8i8 && VT != MVT::v16i8) setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); if (!VT.isFloatingPoint()) setOperationAction(ISD::ABS, VT, Legal); // [SU][MIN|MAX] are available for all NEON types apart from i64. if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. if (VT.isFloatingPoint() && (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) for (unsigned Opcode : {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM}) setOperationAction(Opcode, VT, Legal); if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); } } } void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR64RegClass); addTypeForNEON(VT, MVT::v2i32); } void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR128RegClass); addTypeForNEON(VT, MVT::v4i32); } EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); } static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc) { uint64_t OldImm = Imm, NewImm, Enc; uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; // Return if the immediate is already all zeros, all ones, a bimm32 or a // bimm64. if (Imm == 0 || Imm == Mask || AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) return false; unsigned EltSize = Size; uint64_t DemandedBits = Demanded.getZExtValue(); // Clear bits that are not demanded. Imm &= DemandedBits; while (true) { // The goal here is to set the non-demanded bits in a way that minimizes // the number of switching between 0 and 1. In order to achieve this goal, // we set the non-demanded bits to the value of the preceding demanded bits. // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a // non-demanded bit), we copy bit0 (1) to the least significant 'x', // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. // The final result is 0b11000011. uint64_t NonDemandedBits = ~DemandedBits; uint64_t InvertedImm = ~Imm & DemandedBits; uint64_t RotatedImm = ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & NonDemandedBits; uint64_t Sum = RotatedImm + NonDemandedBits; bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); uint64_t Ones = (Sum + Carry) & NonDemandedBits; NewImm = (Imm | Ones) & Mask; // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate // or all-ones or all-zeros, in which case we can stop searching. Otherwise, // we halve the element size and continue the search. if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) break; // We cannot shrink the element size any further if it is 2-bits. if (EltSize == 2) return false; EltSize /= 2; Mask >>= EltSize; uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; // Return if there is mismatch in any of the demanded bits of Imm and Hi. if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) return false; // Merge the upper and lower halves of Imm and DemandedBits. Imm |= Hi; DemandedBits |= DemandedBitsHi; } ++NumOptimizedImms; // Replicate the element across the register width. while (EltSize < Size) { NewImm |= NewImm << EltSize; EltSize *= 2; } (void)OldImm; assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && "demanded bits should never be altered"); assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); // Create the new constant immediate node. EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue New; // If the new constant immediate is all-zeros or all-ones, let the target // independent DAG combine optimize this node. if (NewImm == 0 || NewImm == OrigMask) { New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), TLO.DAG.getConstant(NewImm, DL, VT)); // Otherwise, create a machine node so that target independent DAG combine // doesn't undo this optimization. } else { Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); New = SDValue( TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); } return TLO.CombineTo(Op, New); } bool AArch64TargetLowering::targetShrinkDemandedConstant( SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { // Delay this optimization to as late as possible. if (!TLO.LegalOps) return false; if (!EnableOptimizeLogicalImm) return false; EVT VT = Op.getValueType(); if (VT.isVector()) return false; unsigned Size = VT.getSizeInBits(); assert((Size == 32 || Size == 64) && "i32 or i64 is expected after legalization."); // Exit early if we demand all bits. if (Demanded.countPopulation() == Size) return false; unsigned NewOpc; switch (Op.getOpcode()) { default: return false; case ISD::AND: NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; break; case ISD::OR: NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; break; case ISD::XOR: NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; break; } ConstantSDNode *C = dyn_cast(Op.getOperand(1)); if (!C) return false; uint64_t Imm = C->getZExtValue(); return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); } /// computeKnownBitsForTargetNode - Determine which of the bits specified in /// Mask are known to be either zero or one and return them Known. void AArch64TargetLowering::computeKnownBitsForTargetNode( const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; case AArch64ISD::CSEL: { KnownBits Known2; Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); Known.Zero &= Known2.Zero; Known.One &= Known2.One; break; } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast(Op->getOperand(1)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); switch (IntID) { default: return; case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { unsigned BitWidth = Known.getBitWidth(); EVT VT = cast(Op)->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } } break; } case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_VOID: { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); switch (IntNo) { default: break; case Intrinsic::aarch64_neon_umaxv: case Intrinsic::aarch64_neon_uminv: { // Figure out the datatype of the vector operand. The UMINV instruction // will zero extend the result, so we can mark as known zero all the // bits larger than the element datatype. 32-bit or larget doesn't need // this as those are legal types and will be handled by isel directly. MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); unsigned BitWidth = Known.getBitWidth(); if (VT == MVT::v8i8 || VT == MVT::v16i8) { assert(BitWidth >= 8 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); Known.Zero |= Mask; } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { assert(BitWidth >= 16 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); Known.Zero |= Mask; } break; } break; } } } } MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, EVT) const { return MVT::i64; } bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const { if (Subtarget->requiresStrictAlign()) return false; if (Fast) { // Some CPUs are fine with unaligned stores except for 128-bit ones. *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || // See comments in performSTORECombine() for more details about // these conditions. // Code that uses clang vector extensions can mark that it // wants unaligned accesses to be treated as fast by // underspecifying alignment to be 1 or 2. Align <= 2 || // Disregard v2i64. Memcpy lowering produces those and splitting // them regresses performance on micro-benchmarks and olden/bh. VT == MVT::v2i64; } return true; } FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return AArch64::createFastISel(funcInfo, libInfo); } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; case AArch64ISD::CALL: return "AArch64ISD::CALL"; case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; case AArch64ISD::ADR: return "AArch64ISD::ADR"; case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; case AArch64ISD::ADC: return "AArch64ISD::ADC"; case AArch64ISD::SBC: return "AArch64ISD::SBC"; case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; case AArch64ISD::DUP: return "AArch64ISD::DUP"; case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; case AArch64ISD::BICi: return "AArch64ISD::BICi"; case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; case AArch64ISD::BSL: return "AArch64ISD::BSL"; case AArch64ISD::NEG: return "AArch64ISD::NEG"; case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; case AArch64ISD::REV16: return "AArch64ISD::REV16"; case AArch64ISD::REV32: return "AArch64ISD::REV32"; case AArch64ISD::REV64: return "AArch64ISD::REV64"; case AArch64ISD::EXT: return "AArch64ISD::EXT"; case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; case AArch64ISD::NOT: return "AArch64ISD::NOT"; case AArch64ISD::BIT: return "AArch64ISD::BIT"; case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS"; case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS"; case AArch64ISD::STG: return "AArch64ISD::STG"; case AArch64ISD::STZG: return "AArch64ISD::STZG"; case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; } return nullptr; } MachineBasicBlock * AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *MBB) const { // We materialise the F128CSEL pseudo-instruction as some control flow and a // phi node: // OrigBB: // [... previous instrs leading to comparison ...] // b.ne TrueBB // b EndBB // TrueBB: // ; Fallthrough // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); unsigned DestReg = MI.getOperand(0).getReg(); unsigned IfTrueReg = MI.getOperand(1).getReg(); unsigned IfFalseReg = MI.getOperand(2).getReg(); unsigned CondCode = MI.getOperand(3).getImm(); bool NZCVKilled = MI.getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, TrueBB); MF->insert(It, EndBB); // Transfer rest of current basic-block to EndBB EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndBB->transferSuccessorsAndUpdatePHIs(MBB); BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MBB->addSuccessor(TrueBB); MBB->addSuccessor(EndBB); // TrueBB falls through to the end. TrueBB->addSuccessor(EndBB); if (!NZCVKilled) { TrueBB->addLiveIn(AArch64::NZCV); EndBB->addLiveIn(AArch64::NZCV); } BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) .addReg(IfTrueReg) .addMBB(TrueBB) .addReg(IfFalseReg) .addMBB(MBB); MI.eraseFromParent(); return EndBB; } MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( MachineInstr &MI, MachineBasicBlock *BB) const { assert(!isAsynchronousEHPersonality(classifyEHPersonality( BB->getParent()->getFunction().getPersonalityFn())) && "SEH does not use catchret!"); return BB; } MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad( MachineInstr &MI, MachineBasicBlock *BB) const { MI.eraseFromParent(); return BB; } MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { default: #ifndef NDEBUG MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); case AArch64::CATCHPAD: return EmitLoweredCatchPad(MI, BB); } } //===----------------------------------------------------------------------===// // AArch64 Lowering private implementation. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 /// CC static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return AArch64CC::NE; case ISD::SETEQ: return AArch64CC::EQ; case ISD::SETGT: return AArch64CC::GT; case ISD::SETGE: return AArch64CC::GE; case ISD::SETLT: return AArch64CC::LT; case ISD::SETLE: return AArch64CC::LE; case ISD::SETUGT: return AArch64CC::HI; case ISD::SETUGE: return AArch64CC::HS; case ISD::SETULT: return AArch64CC::LO; case ISD::SETULE: return AArch64CC::LS; } } /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = AArch64CC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = AArch64CC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = AArch64CC::GE; break; case ISD::SETOLT: CondCode = AArch64CC::MI; break; case ISD::SETOLE: CondCode = AArch64CC::LS; break; case ISD::SETONE: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GT; break; case ISD::SETO: CondCode = AArch64CC::VC; break; case ISD::SETUO: CondCode = AArch64CC::VS; break; case ISD::SETUEQ: CondCode = AArch64CC::EQ; CondCode2 = AArch64CC::VS; break; case ISD::SETUGT: CondCode = AArch64CC::HI; break; case ISD::SETUGE: CondCode = AArch64CC::PL; break; case ISD::SETLT: case ISD::SETULT: CondCode = AArch64CC::LT; break; case ISD::SETLE: case ISD::SETULE: CondCode = AArch64CC::LE; break; case ISD::SETNE: case ISD::SETUNE: CondCode = AArch64CC::NE; break; } } /// Convert a DAG fp condition code to an AArch64 CC. /// This differs from changeFPCCToAArch64CC in that it returns cond codes that /// should be AND'ed instead of OR'ed. static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: changeFPCCToAArch64CC(CC, CondCode, CondCode2); assert(CondCode2 == AArch64CC::AL); break; case ISD::SETONE: // (a one b) // == ((a olt b) || (a ogt b)) // == ((a ord b) && (a une b)) CondCode = AArch64CC::VC; CondCode2 = AArch64CC::NE; break; case ISD::SETUEQ: // (a ueq b) // == ((a uno b) || (a oeq b)) // == ((a ule b) && (a uge b)) CondCode = AArch64CC::PL; CondCode2 = AArch64CC::LE; break; } } /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations /// to get the same effect. static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert) { Invert = false; switch (CC) { default: // Mostly the scalar mappings work fine. changeFPCCToAArch64CC(CC, CondCode, CondCode2); break; case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; case ISD::SETO: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GE; break; case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE: // All of the compare-mask comparisons are ordered, but we can switch // between the two by a double inversion. E.g. ULE == !OGT. Invert = true; changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); break; } } static bool isLegalArithImmed(uint64_t C) { // Matches AArch64DAGToDAGISel::SelectArithImmed(). bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); LLVM_DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n")); return IsLegal; } // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags // can be set differently by this operation. It comes down to whether // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then // everything is fine. If not then the optimization is wrong. Thus general // comparisons are only valid if op2 != 0. // // So, finally, the only LLVM-native comparisons that don't mention C and V // are SETEQ and SETNE. They're the only ones we can safely use CMN for in // the absence of information about op2. static bool isCMN(SDValue Op, ISD::CondCode CC) { return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE); } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); const bool FullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); if (VT.isFloatingPoint()) { assert(VT != MVT::f128); if (VT == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); VT = MVT::f32; } return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); } // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. // A later phase can perform the optimization of setting the destination // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; if (isCMN(RHS, CC)) { // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); } else if (isCMN(LHS, CC)) { // As we are looking for EQ/NE compares, the operands can be commuted ; can // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ? Opcode = AArch64ISD::ADDS; LHS = LHS.getOperand(1); } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one // of the signed comparisons. Opcode = AArch64ISD::ANDS; RHS = LHS.getOperand(1); LHS = LHS.getOperand(0); } return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } /// \defgroup AArch64CCMP CMP;CCMP matching /// /// These functions deal with the formation of CMP;CCMP;... sequences. /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of /// a comparison. They set the NZCV flags to a predefined value if their /// predicate is false. This allows to express arbitrary conjunctions, for /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" /// expressed as: /// cmp A /// ccmp B, inv(CB), CA /// check for CB flags /// /// This naturally lets us implement chains of AND operations with SETCC /// operands. And we can even implement some other situations by transforming /// them: /// - We can implement (NEG SETCC) i.e. negating a single comparison by /// negating the flags used in a CCMP/FCCMP operations. /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations /// by negating the flags we test for afterwards. i.e. /// NEG (CMP CCMP CCCMP ...) can be implemented. /// - Note that we can only ever negate all previously processed results. /// What we can not implement by flipping the flags to test is a negation /// of two sub-trees (because the negation affects all sub-trees emitted so /// far, so the 2nd sub-tree we emit would also affect the first). /// With those tools we can implement some OR operations: /// - (OR (SETCC A) (SETCC B)) can be implemented via: /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) /// - After transforming OR to NEG/AND combinations we may be able to use NEG /// elimination rules from earlier to implement the whole thing as a /// CCMP/FCCMP chain. /// /// As complete example: /// or (or (setCA (cmp A)) (setCB (cmp B))) /// (and (setCC (cmp C)) (setCD (cmp D)))" /// can be reassociated to: /// or (and (setCC (cmp C)) setCD (cmp D)) // (or (setCA (cmp A)) (setCB (cmp B))) /// can be transformed to: /// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" /// which can be implemented as: /// cmp C /// ccmp D, inv(CD), CC /// ccmp A, CA, inv(CD) /// ccmp B, CB, inv(CA) /// check for CB flags /// /// A counterexample is "or (and A B) (and C D)" which translates to /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we /// can only implement 1 of the inner (not) operations, but not both! /// @{ /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; const bool FullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); if (LHS.getValueType().isFloatingPoint()) { assert(LHS.getValueType() != MVT::f128); if (LHS.getValueType() == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); } Opcode = AArch64ISD::FCCMP; } else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // See emitComparison() on why we can only do this for SETEQ and SETNE. Opcode = AArch64ISD::CCMN; RHS = RHS.getOperand(1); } } if (Opcode == 0) Opcode = AArch64ISD::CCMP; SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be /// expressed as a conjunction. See \ref AArch64CCMP. /// \param CanNegate Set to true if we can negate the whole sub-tree just by /// changing the conditions on the SETCC tests. /// (this means we can call emitConjunctionRec() with /// Negate==true on this sub-tree) /// \param MustBeFirst Set to true if this subtree needs to be negated and we /// cannot do the negation naturally. We are required to /// emit the subtree first in this case. /// \param WillNegate Is true if are called when the result of this /// subexpression must be negated. This happens when the /// outer expression is an OR. We can use this fact to know /// that we have a double negation (or (or ...) ...) that /// can be implemented for free. static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { if (Val->getOperand(0).getValueType() == MVT::f128) return false; CanNegate = true; MustBeFirst = false; return true; } // Protect against exponential runtime and stack overflow. if (Depth > 6) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { bool IsOR = Opcode == ISD::OR; SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); bool CanNegateL; bool MustBeFirstL; if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1)) return false; bool CanNegateR; bool MustBeFirstR; if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1)) return false; if (MustBeFirstL && MustBeFirstR) return false; if (IsOR) { // For an OR expression we need to be able to naturally negate at least // one side or we cannot do the transformation at all. if (!CanNegateL && !CanNegateR) return false; // If we the result of the OR will be negated and we can naturally negate // the leafs, then this sub-tree as a whole negates naturally. CanNegate = WillNegate && CanNegateL && CanNegateR; // If we cannot naturally negate the whole sub-tree, then this must be // emitted first. MustBeFirst = !CanNegate; } else { assert(Opcode == ISD::AND && "Must be OR or AND"); // We cannot naturally negate an AND operation. CanNegate = false; MustBeFirst = MustBeFirstL || MustBeFirstR; } return true; } return false; } /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain /// of CCMP/CFCMP ops. See @ref AArch64CCMP. /// Tries to transform the given i1 producing node @p Val to a series compare /// and conditional compare operations. @returns an NZCV flags producing node /// and sets @p OutCC to the flags that should be tested or returns SDValue() if /// transformation was not possible. /// \p Negate is true if we want this sub-tree being negated just by changing /// SETCC conditions. static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate) { // We're at a tree leaf, produce a conditional comparison operation. unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); ISD::CondCode CC = cast(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); if (Negate) CC = getSetCCInverse(CC, isInteger); SDLoc DL(Val); // Determine OutCC and handle FP special case. if (isInteger) { OutCC = changeIntCCToAArch64CC(CC); } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); // Some floating point conditions can't be tested with a single condition // code. Construct an additional comparison in this case. if (ExtraCC != AArch64CC::AL) { SDValue ExtraCmp; if (!CCOp.getNode()) ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); else ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, ExtraCC, DL, DAG); CCOp = ExtraCmp; Predicate = ExtraCC; } } // Produce a normal comparison if we are first in the chain if (!CCOp) return emitComparison(LHS, RHS, CC, DL, DAG); // Otherwise produce a ccmp. return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); } assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); bool IsOR = Opcode == ISD::OR; SDValue LHS = Val->getOperand(0); bool CanNegateL; bool MustBeFirstL; bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); assert(ValidL && "Valid conjunction/disjunction tree"); (void)ValidL; SDValue RHS = Val->getOperand(1); bool CanNegateR; bool MustBeFirstR; bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); assert(ValidR && "Valid conjunction/disjunction tree"); (void)ValidR; // Swap sub-tree that must come first to the right side. if (MustBeFirstL) { assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); std::swap(LHS, RHS); std::swap(CanNegateL, CanNegateR); std::swap(MustBeFirstL, MustBeFirstR); } bool NegateR; bool NegateAfterR; bool NegateL; bool NegateAfterAll; if (Opcode == ISD::OR) { // Swap the sub-tree that we can negate naturally to the left. if (!CanNegateL) { assert(CanNegateR && "at least one side must be negatable"); assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); assert(!Negate); std::swap(LHS, RHS); NegateR = false; NegateAfterR = true; } else { // Negate the left sub-tree if possible, otherwise negate the result. NegateR = CanNegateR; NegateAfterR = !CanNegateR; } NegateL = true; NegateAfterAll = !Negate; } else { assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); assert(!Negate && "Valid conjunction/disjunction tree"); NegateL = false; NegateR = false; NegateAfterR = false; NegateAfterAll = false; } // Emit sub-trees. AArch64CC::CondCode RHSCC; SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); if (NegateAfterR) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); if (NegateAfterAll) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). /// In some cases this is even possible with OR operations in the expression. /// See \ref AArch64CCMP. /// \see emitConjunctionRec(). static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC) { bool DummyCanNegate; bool DummyMustBeFirst; if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) return SDValue(); return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); } /// @} /// Returns how profitable it is to fold a comparison's operand's shift and/or /// extension operations. static unsigned getCmpOperandFoldingProfit(SDValue Op) { auto isSupportedExtend = [&](SDValue V) { if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) return true; if (V.getOpcode() == ISD::AND) if (ConstantSDNode *MaskCst = dyn_cast(V.getOperand(1))) { uint64_t Mask = MaskCst->getZExtValue(); return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); } return false; }; if (!Op.hasOneUse()) return 0; if (isSupportedExtend(Op)) return 1; unsigned Opc = Op.getOpcode(); if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) if (ConstantSDNode *ShiftCst = dyn_cast(Op.getOperand(1))) { uint64_t Shift = ShiftCst->getZExtValue(); if (isSupportedExtend(Op.getOperand(0))) return (Shift <= 4) ? 2 : 1; EVT VT = Op.getValueType(); if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) return 1; } return 0; } static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl) { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); if (!isLegalArithImmed(C)) { // Constant does not fit, try adjusting it by one? switch (CC) { default: break; case ISD::SETLT: case ISD::SETGE: if ((VT == MVT::i32 && C != 0x80000000 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0x80000000ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULT: case ISD::SETUGE: if ((VT == MVT::i32 && C != 0 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETLE: case ISD::SETGT: if ((VT == MVT::i32 && C != INT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != INT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULE: case ISD::SETUGT: if ((VT == MVT::i32 && C != UINT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != UINT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, dl, VT); } break; } } } // Comparisons are canonicalized so that the RHS operand is simpler than the // LHS one, the extreme case being when RHS is an immediate. However, AArch64 // can fold some shift+extend operations on the RHS operand, so swap the // operands if that can be done. // // For example: // lsl w13, w11, #1 // cmp w13, w12 // can be turned into: // cmp w12, w11, lsl #1 if (!isa(RHS) || !isLegalArithImmed(cast(RHS)->getZExtValue())) { SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); } } SDValue Cmp; AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS)) { const ConstantSDNode *RHSC = cast(RHS); // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. // For the i8 operand, the largest immediate is 255, so this can be easily // encoded in the compare instruction. For the i16 operand, however, the // largest immediate cannot be encoded in the compare. // Therefore, use a sign extending load and cmn to avoid materializing the // -1 constant. For example, // movz w1, #65535 // ldrh w0, [x0, #0] // cmp w0, w1 // > // ldrsh w0, [x0, #0] // cmn w0, #1 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) // if and only if (sext LHS) == (sext RHS). The checks are in place to // ensure both the LHS and RHS are truly zero extended and to make sure the // transformation is profitable. if ((RHSC->getZExtValue() >> 16 == 0) && isa(LHS) && cast(LHS)->getExtensionType() == ISD::ZEXTLOAD && cast(LHS)->getMemoryVT() == MVT::i16 && LHS.getNode()->hasNUsesOfValue(1, 0)) { int16_t ValueofRHS = cast(RHS)->getZExtValue(); if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, DAG.getValueType(MVT::i16)); Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, RHS.getValueType()), CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } } if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); } } } if (!Cmp) { Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } static std::pair getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && "Unsupported value type"); SDValue Value, Overflow; SDLoc DL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned Opc = 0; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::VS; break; case ISD::UADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::HS; break; case ISD::SSUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::VS; break; case ISD::USUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::LO; break; // Multiply needs a little bit extra work. case ISD::SMULO: case ISD::UMULO: { CC = AArch64CC::NE; bool IsSigned = Op.getOpcode() == ISD::SMULO; if (Op.getValueType() == MVT::i32) { unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; // For a 32 bit multiply with overflow check we want the instruction // selector to generate a widening multiply (SMADDL/UMADDL). For that we // need to generate the following pattern: // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, DAG.getConstant(0, DL, MVT::i64)); // On AArch64 the upper 32 bits are always zero extended for a 32 bit // operation. We need to clear out the upper 32 bits, because we used a // widening multiply that wrote all 64 bits. In the end this should be a // noop. Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); if (IsSigned) { // The signed overflow check requires more than just a simple check for // any bit set in the upper 32 bits of the result. These bits could be // just the sign bits of a negative number. To perform the overflow // check we have to arithmetic shift right the 32nd bit of the result by // 31 bits. Then we compare the result to the upper 32 bits. SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, DAG.getConstant(32, DL, MVT::i64)); UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, DAG.getConstant(31, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { // The overflow check for unsigned multiply is easy. We only need to // check if any of the upper 32 bits are set. This can be done with a // CMP (shifted register). For that we need to generate the following // pattern: // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, DL, MVT::i64)); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; } assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); // For the 64 bit multiply Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); if (IsSigned) { SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, DAG.getConstant(63, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; } } // switch (...) if (Opc) { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); // Emit the AArch64 operation with overflow check. Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } // Returns true if the given Op is the overflow flag result of an overflow // intrinsic operation. static bool isOverflowIntrOpRes(SDValue Op) { unsigned Opc = Op.getOpcode(); return (Op.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)); } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { SDValue Sel = Op.getOperand(0); SDValue Other = Op.getOperand(1); SDLoc dl(Sel); // If the operand is an overflow checking operation, invert the condition // code and kill the Not operation. I.e., transform: // (xor (overflow_op_bool, 1)) // --> // (csel 1, 0, invert(cc), overflow_op_bool) // ... which later gets transformed to just a cset instruction with an // inverted condition code, rather than a cset + eor sequence. if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) return SDValue(); SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); AArch64CC::CondCode CC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, CCVal, Overflow); } // If neither operand is a SELECT_CC, give up. if (Sel.getOpcode() != ISD::SELECT_CC) std::swap(Sel, Other); if (Sel.getOpcode() != ISD::SELECT_CC) return Op; // The folding we want to perform is: // (xor x, (select_cc a, b, cc, 0, -1) ) // --> // (csel x, (xor x, -1), cc ...) // // The latter will get matched to a CSINV instruction. ISD::CondCode CC = cast(Sel.getOperand(4))->get(); SDValue LHS = Sel.getOperand(0); SDValue RHS = Sel.getOperand(1); SDValue TVal = Sel.getOperand(2); SDValue FVal = Sel.getOperand(3); // FIXME: This could be generalized to non-integer comparisons. if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return Op; ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); // The values aren't constants, this isn't the pattern we're looking for. if (!CFVal || !CTVal) return Op; // We can commute the SELECT_CC by inverting the condition. This // might be needed to make this fit into a CSINV pattern. if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } // If the constants line up, perform the transform! if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); FVal = Other; TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, DAG.getConstant(-1ULL, dl, Other.getValueType())); return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, CCVal, Cmp); } return Op; } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = AArch64ISD::ADDS; break; case ISD::SUBC: Opc = AArch64ISD::SUBS; break; case ISD::ADDE: Opc = AArch64ISD::ADCS; ExtraOp = true; break; case ISD::SUBE: Opc = AArch64ISD::SBCS; ExtraOp = true; break; } if (!ExtraOp) return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); SDLoc dl(Op); AArch64CC::CondCode CC; // The actual operation that sets the overflow or carry flag. SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); // We use an inverted condition, because the conditional select is inverted // too. This will allow it to be selected to a single instruction: // CSINC Wd, WZR, WZR, invert(cond). SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, CCVal, Overflow); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } // Prefetch operands are: // 1: Address to prefetch // 2: bool isWrite // 3: int locality (0 = no locality ... 3 = extreme locality) // 4: bool isDataCache static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); unsigned IsData = cast(Op.getOperand(4))->getZExtValue(); bool IsStream = !Locality; // When the locality number is set if (Locality) { // The front-end should have filtered out the out-of-range values assert(Locality <= 3 && "Prefetch locality out-of-range"); // The locality degree is the opposite of the cache speed. // Put the number the other way around. // The encoding starts at 0 for level 1 Locality = 3 - Locality; } // built the mask value encoding the expected behavior. unsigned PrfOp = (IsWrite << 4) | // Load/Store bit (!IsData << 3) | // IsDataCache bit (Locality << 1) | // Cache level bits (unsigned)IsStream; // Stream bit return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); RTLIB::Libcall LC; LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, SDLoc(Op)).first; } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); unsigned NumElts = InVT.getVectorNumElements(); // f16 conversions are promoted to f32 when full fp16 is not supported. if (InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { SDLoc dl(Op); MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } // Type changing conversions are illegal. return Op; } SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(0).getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); // f16 conversions are promoted to f32 when full fp16 is not supported. if (Op.getOperand(0).getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); } if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. EVT VT = Op.getValueType(); SDLoc dl(Op); SDValue In = Op.getOperand(0); EVT InVT = In.getValueType(); if (VT.getSizeInBits() < InVT.getSizeInBits()) { MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { unsigned CastOpc = Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); return DAG.getNode(Op.getOpcode(), dl, VT, In); } return Op; } SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); // f16 conversions are promoted to f32 when full fp16 is not supported. if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { SDLoc dl(Op); return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), DAG.getIntPtrConstant(0, dl)); } // i128 conversions are libcalls. if (Op.getOperand(0).getValueType() == MVT::i128) return SDValue(); // Other conversions are legal, unless it's to the completely software-based // fp128. if (Op.getValueType() != MVT::f128) return Op; RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { // For iOS, we want to call an alternative entry point: __sincos_stret, // which returns the values in two S / D registers. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); ArgListTy Args; ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; const char *LibcallName = getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { if (Op.getValueType() != MVT::f16) return SDValue(); assert(Op.getOperand(0).getValueType() == MVT::i16); SDLoc DL(Op); Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); } static EVT getExtensionTo64Bits(const EVT &OrigVT) { if (OrigVT.getSizeInBits() >= 64) return OrigVT; assert(OrigVT.isSimple() && "Expecting a simple value type"); MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; switch (OrigSimpleTy) { default: llvm_unreachable("Unexpected Vector Type"); case MVT::v2i8: case MVT::v2i16: return MVT::v2i32; case MVT::v4i8: return MVT::v4i16; } } static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode) { // The vector originally had a size of OrigTy. It was then extended to ExtTy. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than // 64-bits we need to insert a new extension so that it will be 64-bits. assert(ExtTy.is128BitVector() && "Unexpected extension size"); if (OrigTy.getSizeInBits() >= 64) return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. EVT NewVT = getExtensionTo64Bits(OrigTy); return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned) { EVT VT = N->getValueType(0); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (const SDValue &Elt : N->op_values()) { if (ConstantSDNode *C = dyn_cast(Elt)) { unsigned EltSize = VT.getScalarSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) return false; } else { if (!isUIntN(HalfSize, C->getZExtValue())) return false; } continue; } return false; } return true; } static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), N->getOpcode()); assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); SDLoc dl(N); unsigned EltSize = VT.getScalarSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector Ops; for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { return N->getOpcode() == ISD::SIGN_EXTEND || isExtendedBUILD_VECTOR(N, DAG, true); } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { return N->getOpcode() == ISD::ZERO_EXTEND || isExtendedBUILD_VECTOR(N, DAG, false); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) && isSignExtended(N1, DAG); } return false; } static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); } return false; } SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, DAG.getConstant(3, dl, MVT::i32)); } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. EVT VT = Op.getValueType(); assert(VT.is128BitVector() && VT.isInteger() && "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; bool isMLA = false; bool isN0SExt = isSignExtended(N0, DAG); bool isN1SExt = isSignExtended(N1, DAG); if (isN0SExt && isN1SExt) NewOpc = AArch64ISD::SMULL; else { bool isN0ZExt = isZeroExtended(N0, DAG); bool isN1ZExt = isZeroExtended(N1, DAG); if (isN0ZExt && isN1ZExt) NewOpc = AArch64ISD::UMULL; else if (isN1SExt || isN1ZExt) { // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these // into (s/zext A * s/zext C) + (s/zext B * s/zext C) if (isN1SExt && isAddSubSExt(N0, DAG)) { NewOpc = AArch64ISD::SMULL; isMLA = true; } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { NewOpc = AArch64ISD::UMULL; isMLA = true; } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { std::swap(N0, N1); NewOpc = AArch64ISD::UMULL; isMLA = true; } } if (!NewOpc) { if (VT == MVT::v2i64) // Fall through to expand this. It is not legal. return SDValue(); else // Other vector multiplications are legal. return Op; } } // Legalize to a S/UMULL instruction SDLoc DL(Op); SDValue Op0; SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); if (!isMLA) { Op0 = skipExtensionForVectorMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during // isel lowering to take advantage of no-stall back to back s/umul + s/umla. // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::aarch64_neon_abs: { EVT Ty = Op.getValueType(); if (Ty == MVT::i64) { SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op.getOperand(1)); Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result); return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result); } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) { return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1)); } else { report_fatal_error("Unexpected type for AArch64 NEON intrinic"); } } case Intrinsic::aarch64_neon_smax: return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_umax: return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_smin: return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_umin: return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); unsigned Reg = RegInfo->getLocalAddressRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, Op.getSimpleValueType()); } case Intrinsic::eh_recoverfp: { // FIXME: This needs to be implemented to correctly handle highly aligned // stack objects. For now we simply return the incoming FP. Refer D53541 // for more details. SDValue FnOp = Op.getOperand(1); SDValue IncomingFPOp = Op.getOperand(2); GlobalAddressSDNode *GSD = dyn_cast(FnOp); auto *Fn = dyn_cast_or_null(GSD ? GSD->getGlobal() : nullptr); if (!Fn) report_fatal_error( "llvm.eh.recoverfp must take a function as the first argument"); return IncomingFPOp; } } } // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG) { assert(VT.isVector() && "VT should be a vector type"); assert(MemVT == MVT::v4i8 && VT == MVT::v4i16); SDValue Value = ST->getValue(); // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract // the word lane which represent the v4i8 subvector. It optimizes the store // to: // // xtn v0.8b, v0.8h // str s0, [x0] SDValue Undef = DAG.getUNDEF(MVT::i16); SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL, {Undef, Undef, Undef, Undef}); SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Value, UndefVec); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt); Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc); SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Trunc, DAG.getConstant(0, DL, MVT::i64)); return DAG.getStore(ST->getChain(), DL, ExtractTrunc, ST->getBasePtr(), ST->getMemOperand()); } // Custom lowering for any store, vector or scalar and/or default or with // a truncate operations. Currently only custom lower truncate operation // from vector v4i16 to v4i8. SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc Dl(Op); StoreSDNode *StoreNode = cast(Op); assert (StoreNode && "Can only custom lower store nodes"); SDValue Value = StoreNode->getValue(); EVT VT = Value.getValueType(); EVT MemVT = StoreNode->getMemoryVT(); assert (VT.isVector() && "Can only custom lower vector store types"); unsigned AS = StoreNode->getAddressSpace(); unsigned Align = StoreNode->getAlignment(); if (Align < MemVT.getStoreSize() && !allowsMisalignedMemoryAccesses( MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) { return scalarizeVectorStore(StoreNode, DAG); } if (StoreNode->isTruncatingStore()) { return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); } return SDValue(); } SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); LLVM_DEBUG(Op.dump()); switch (Op.getOpcode()) { default: llvm_unreachable("unimplemented operand"); return SDValue(); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: return LowerF128Call(Op, DAG, RTLIB::ADD_F128); case ISD::FSUB: return LowerF128Call(Op, DAG, RTLIB::SUB_F128); case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::SPONENTRY: return LowerSPONENTRY(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerVectorSRA_SRL_SHL(Op, DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTPOP: return LowerCTPOP(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::OR: return LowerVectorOR(Op, DAG); case ISD::XOR: return LowerXOR(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: return LowerVECREDUCE(Op, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerATOMIC_LOAD_SUB(Op, DAG); case ISD::ATOMIC_LOAD_AND: return LowerATOMIC_LOAD_AND(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } } //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { default: report_fatal_error("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: if (Subtarget->isTargetWindows() && IsVarArg) return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; case CallingConv::Win64: return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; case CallingConv::AArch64_VectorCall: return CC_AArch64_AAPCS; } } CCAssignFn * AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; } SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // At this point, Ins[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeFormalArguments to pass in ValVT and // LocVT. unsigned NumArgs = Ins.size(); Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; if (Ins[i].isOrigArg()) { std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[i].getOrigArgIndex(); // Get type of the original argument. EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; } CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } assert(ArgLocs.size() == Ins.size()); SmallVector ArgValues; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a // non-compliant manner for larger structs. EVT PtrVT = getPointerTy(DAG.getDataLayout()); int Size = Ins[i].Flags.getByValSize(); unsigned NumRegs = (Size + 7) / 8; // FIXME: This works on big-endian for composite byvals, which are the common // case. It should also work for fundamental types too. unsigned FrameIdx = MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); InVals.push_back(FrameIdxN); continue; } if (VA.isRegLoc()) { // Arguments stored in registers. EVT RegVT = VA.getLocVT(); SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; else if (RegVT == MVT::f16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; else if (RegVT == MVT::f64 || RegVT.is64BitVector()) RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); // If this is an 8, 16 or 32-bit value, it is really passed promoted // to 64 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt // nodes after our lowering. assert(RegVT == Ins[i].VT && "incorrect register location selected"); break; } InVals.push_back(ArgValue); } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; uint32_t BEAlign = 0; if (!Subtarget->isLittleEndian() && ArgSize < 8 && !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; MVT MemVT = VA.getValVT(); switch (VA.getLocInfo()) { default: break; case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; case CCValAssign::ZExt: ExtType = ISD::ZEXTLOAD; break; case CCValAssign::AExt: ExtType = ISD::EXTLOAD; break; } ArgValue = DAG.getExtLoad( ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); InVals.push_back(ArgValue); } } // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo(); if (isVarArg) { if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. // Win64 variadic functions also pass arguments in registers, but all float // arguments are passed in integer registers. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. StackOffset = ((StackOffset + 7) & ~7); FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); if (MFI.hasMustTailInVarArgFunc()) { SmallVector RegParmTypes; RegParmTypes.push_back(MVT::i64); RegParmTypes.push_back(MVT::f128); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_AArch64_AAPCS); // Conservatively forward X8, since it might be used for aggregate return. if (!CCInfo.isAllocated(AArch64::X8)) { unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); } } } // On Windows, InReg pointers must be returned, so record the pointer in a // virtual register at the start of the function so it can be returned in the // epilogue. if (IsWin64) { for (unsigned I = 0, E = Ins.size(); I != E; ++I) { if (Ins[I].Flags.isInReg()) { assert(!FuncInfo->getSRetReturnReg()); MVT PtrTy = getPointerTy(DAG.getDataLayout()); unsigned Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); break; } } } unsigned StackArgSize = CCInfo.getNextStackOffset(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: StackArgSize = alignTo(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. FuncInfo->setArgumentStackToRestore(StackArgSize); // This realignment carries over to the available bytes below. Our own // callers will guarantee the space is free by giving an aligned value to // CALLSEQ_START. } // Even if we're not expected to free up the space, it's useful to know how // much is there while considering tail calls (because we can reuse it). FuncInfo->setBytesInStackArgArea(StackArgSize); if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); return Chain; } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); SmallVector MemOps; static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 }; static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { if (IsWin64) { GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); if (GPRSaveSize & 15) // The extra size here, if triggered, will always be 8. MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); } else GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, IsWin64 ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), GPRIdx, (i - FirstVariadicGPR) * 8) : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false); SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); FuncInfo->setVarArgsFPRSize(FPRSaveSize); } if (!MemOps.empty()) { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); continue; } SDValue Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return CC == CallingConv::Fast; } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { case CallingConv::C: case CallingConv::PreserveMost: case CallingConv::Swift: return true; default: return canGuaranteeTCO(CC); } } bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. for (Function::const_arg_iterator i = CallerF.arg_begin(), e = CallerF.arg_end(); i != e; ++i) { if (i->hasByValAttr()) return false; // On Windows, "inreg" attributes signify non-aggregate indirect returns. // In this case, it is necessary to save/restore X0 in the callee. Tail // call opt interferes with this. So we disable tail call opt when the // caller has an argument with "inreg" attribute. // FIXME: Check whether the callee also has an "inreg" argument. if (i->hasInRegAttr()) return false; } if (getTargetMachine().Options.GuaranteedTailCallOpt) return canGuaranteeTCO(CalleeCC) && CCMatch; // Externally-defined functions with weak linkage should not be // tail-called on AArch64 when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls // to undefined weak functions to be replaced with a NOP or jump to the // next instruction. The behaviour of branch instructions in this // situation (as used for tail calls) is implementation-defined, so we // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } // Now we search for cases where we can use a tail call without changing the // ABI. Sibcall is used in some places (particularly gcc) to refer to this // concept. // I want anyone implementing a new calling convention to think long and hard // about this assert. assert((!isVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If // caller is C then we could potentially use its argument area. // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (const CCValAssign &ArgLoc : ArgLocs) if (!ArgLoc.isRegLoc()) return false; } // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, CCAssignFnForCall(CalleeCC, isVarArg), CCAssignFnForCall(CallerCC, isVarArg))) return false; // The callee has to preserve all registers the caller needs to preserve. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (Subtarget->hasCustomCallingConv()) { TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); } if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } // Nothing more to check if the callee is taking no arguments if (Outs.empty()) return true; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); const AArch64FunctionInfo *FuncInfo = MF.getInfo(); // If the stack arguments for this call do not fit into our own save area then // the call cannot be made tail. if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) return false; const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; return true; } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const { SmallVector ArgChains; int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the // CALLSEQ_BEGIN node. ArgChains.push_back(Chain); // Add a chain value for each stack argument corresponding for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) if (LoadSDNode *L = dyn_cast(*U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) { int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); int64_t InLastByte = InFirstByte; InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || (FirstByte <= InFirstByte && InFirstByte <= LastByte)) ArgChains.push_back(SDValue(L, 1)); } // Build a tokenfactor for all the chains. return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { return CallCC == CallingConv::Fast && TailCallOpt; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, /// and add input and output parameter nodes. SDValue AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &DL = CLI.DL; SmallVector &Outs = CLI.Outs; SmallVector &OutVals = CLI.OutVals; SmallVector &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; bool IsSibCall = false; if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: if (!TailCallOpt && IsTailCall) IsSibCall = true; if (IsTailCall) ++NumTailCalls; } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); if (IsVarArg) { // Handle fixed and variable vector arguments differently. // Variable vector arguments always go into memory. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/ !Outs[i].IsFixed); bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } else { // At this point, Outs[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeCallOperands to pass in ValVT and // LocVT. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Outs[i].VT; // Get type of the original argument. EVT ActualVT = getValueType(DAG.getDataLayout(), CLI.getArgs()[Outs[i].OrigArgIndex].Ty, /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); if (IsSibCall) { // Since we're not changing the ABI to make this a tail call, the memory // operands are already available in the caller's incoming argument space. NumBytes = 0; } // FPDiff is the byte offset of the call's argument area from the callee's. // Stores to callee stack arguments will be placed in FixedStackSlots offset // by this amount for a tail call. In a sibling call it must be 0 because the // caller will deallocate the entire stack and the callee still expects its // arguments to begin at SP+0. Completely unused for non-tail calls. int FPDiff = 0; if (IsTailCall && !IsSibCall) { unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. NumBytes = alignTo(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we // can actually shrink the stack. FPDiff = NumReusableBytes - NumBytes; // The stack pointer must be 16-byte aligned at all times it's used for a // memory operation, which in practice means at *all* times and in // particular across call boundaries. Therefore our own arguments started at // a 16-byte aligned SP and the delta applied for the tail call should // satisfy the same constraint. assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); SmallVector, 8> RegsToPass; SmallVector MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) { const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExt: if (Outs[realArgIdx].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to 8-bits by the caller. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; } if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i64) { assert(VA.getLocVT() == MVT::i64 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i64 && "unexpected use of 'returned'"); IsThisReturn = true; } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { assert(VA.isMemLoc()); SDValue DstAddr; MachinePointerInfo DstInfo; // FIXME: This works on big-endian for composite byvals, which are the // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; if (!Subtarget->isLittleEndian() && !Flags.isByVal() && !Flags.isInConsecutiveRegs()) { if (OpSize < 8) BEAlign = 8 - OpSize; } unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); if (IsTailCall) { Offset = Offset + FPDiff; int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be // clobbered. Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); } else { SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset); } if (Outs[i].Flags.isByVal()) { SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); SDValue Cpy = DAG.getMemcpy( Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, /*isTailCall = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already // promoted to a legal register type i32, we should truncate Arg back to // i1/i8/i16. if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); MemOpChains.push_back(Store); } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (auto &RegToPass : RegsToPass) { Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, RegToPass.second, InFlag); InFlag = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. if (auto *G = dyn_cast(Callee)) { auto GV = G->getGlobal(); if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) == AArch64II::MO_GOT) { Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT); } else { const GlobalValue *GV = G->getGlobal(); Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); } } else if (auto *S = dyn_cast(Callee)) { if (getTargetMachine().getCodeModel() == CodeModel::Large && Subtarget->isTargetMachO()) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } } // We don't usually want to end the call-sequence here because we would tidy // the frame up *after* the call, however in the ABI-changing tail-call case // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(0, DL, true), InFlag, DL); InFlag = Chain.getValue(1); } std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual // consumption by emitEpilogue. Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); } // Add argument registers to the end of the list so that they are known live // into the call. for (auto &RegToPass : RegsToPass) Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable Mask = TRI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { IsThisReturn = false; Mask = TRI->getCallPreservedMask(MF, CallConv); } } else Mask = TRI->getCallPreservedMask(MF, CallConv); if (Subtarget->hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(MF, &Mask); if (TRI->isAnyArgRegReserved(MF)) TRI->emitReservedArgRegCallError(MF); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); // If we're doing a tall call, use a TC_RETURN here rather than an // actual call instruction. if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(CalleePopBytes, DL, true), InFlag, DL); if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, InVals, IsThisReturn, IsThisReturn ? OutVals[0] : SDValue()); } bool AArch64TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); } SDValue AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { auto &MF = DAG.getMachineFunction(); auto *FuncInfo = MF.getInfo(); CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC); // Copy the result values into the output registers. SDValue Flag; SmallVector RetOps(1, Chain); for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: if (Outs[i].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to i8 by the producer of the // value. This is strictly redundant on Darwin (which uses "zeroext // i1"), but will be optimised out before ISel. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); } break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; } Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } // Windows AArch64 ABIs require that for returning structs by value we copy // the sret argument into X0 for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into X0. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = AArch64::X0; Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag); Flag = Chain.getValue(1); RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (AArch64::GPR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else if (AArch64::FPR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); } //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, N->getOffset(), Flag); } SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); } SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), N->getOffset(), Flag); } SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); } // (loadGOT sym) template SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes instead of using a wrapper node. return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); } // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) template SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, Ty, getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); } // (addlow (adrp %hi(sym)) %lo(sym)) template SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); SDValue Lo = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); } // (adr sym) template SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue Sym = getTargetNode(N, Ty, DAG, Flags); return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym); } SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); if (OpFlags != AArch64II::MO_NO_FLAG) assert(cast(Op)->getOffset() == 0 && "unexpected offset in global node"); // This also catches the large code model case for Darwin, and tiny code // model with got relocations. if ((OpFlags & AArch64II::MO_GOT) != 0) { return getGOT(GN, DAG, OpFlags); } SDValue Result; if (getTargetMachine().getCodeModel() == CodeModel::Large) { Result = getAddrLarge(GN, DAG, OpFlags); } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { Result = getAddrTiny(GN, DAG, OpFlags); } else { Result = getAddr(GN, DAG, OpFlags); } EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(GN); if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } /// Convert a TLS address reference into the correct sequence of loads /// and calls to compute the variable's address (for Darwin, currently) and /// return an SDValue containing the final node. /// Darwin only has one TLS scheme which must be capable of dealing with the /// fully general situation, in the worst case. This means: /// + "extern __thread" declaration. /// + Defined in a possibly unknown dynamic library. /// /// The general system is that each __thread variable has a [3 x i64] descriptor /// which contains information used by the runtime to calculate the address. The /// only part of this the compiler needs to know about is the first xword, which /// contains a function pointer that must be called with the address of the /// entire descriptor in "x0". /// /// Since this descriptor may be in a different unit, in general even the /// descriptor must be accessed via an indirect load. The "ideal" code sequence /// is: /// adrp x0, _var@TLVPPAGE /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, /// ; the function pointer /// blr x1 ; Uses descriptor address in x0 /// ; Address of _var is now in x0. /// /// If the address of _var's descriptor *is* known to the linker, then it can /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for /// a slight efficiency gain. SDValue AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "This function expects a Darwin target"); SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); const GlobalValue *GV = cast(Op)->getGlobal(); SDValue TLVPAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); // The first entry in the descriptor is a function pointer that we must call // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), /* Alignment = */ 8, MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *Mask = TRI->getTLSCallPreservedMask(); if (Subtarget->hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: x0 takes the address of the descriptor, and // returns the address of the variable in this thread. Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); Chain = DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), DAG.getRegisterMask(Mask), Chain.getValue(1)); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); } /// When accessing thread-local variables under either the general-dynamic or /// local-dynamic system, we make a "TLS-descriptor" call. The variable will /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry /// is a function pointer to carry out the resolution. /// /// The sequence is: /// adrp x0, :tlsdesc:var /// ldr x1, [x0, #:tlsdesc_lo12:var] /// add x0, x0, #:tlsdesc_lo12:var /// .tlsdesccall var /// blr x1 /// (TPIDR_EL0 offset now in x0) /// /// The above sequence must be produced unscheduled, to enable the linker to /// optimize/relax this sequence. /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the /// above sequence, and expanded really late in the compilation flow, to ensure /// the sequence is produced as per above. SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); } SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); if (getTargetMachine().getCodeModel() == CodeModel::Large) report_fatal_error("ELF TLS only supported in small memory model"); // Different choices can be made for the maximum size of the TLS area for a // module. For the small address model, the default TLS size is 16MiB and the // maximum TLS size is 4GiB. // FIXME: add -mtls-size command line option and make it control the 16MiB // vs. 4GiB code sequence generation. // FIXME: add tiny codemodel support. We currently generate the same code as // small, which may be larger than needed. const GlobalAddressSDNode *GA = cast(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; } SDValue TPOff; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); if (Model == TLSModel::LocalExec) { SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue TPWithOff_lo = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, HiVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); SDValue TPWithOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); return TPWithOff; } else if (Model == TLSModel::InitialExec) { TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); } else if (Model == TLSModel::LocalDynamic) { // Local-dynamic accesses proceed in two phases. A general-dynamic TLS // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate // the beginning of the module's TLS region, followed by a DTPREL offset // calculation. // These accesses will need deduplicating if there's more than one. AArch64FunctionInfo *MFI = DAG.getMachineFunction().getInfo(); MFI->incNumLocalDynamicTLSAccesses(); // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS); // Now we can calculate the offset from TPIDR_EL0 to this module's // thread-local area. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); // Now use :dtprel_whatever: operations to calculate this variable's offset // in its thread-storage area. SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); } else if (Model == TLSModel::GeneralDynamic) { // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); // Finally we can make a call to calculate the offset from tpidr_el0. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); } else llvm_unreachable("Unsupported ELF TLS access model"); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } SDValue AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); // Load the ThreadLocalStoragePointer from the TEB // A pointer to the TLS array is located at offset 0x58 from the TEB. SDValue TLSArray = DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); Chain = TLSArray.getValue(1); // Load the TLS index from the C runtime; // This does the same as getAddr(), but without having a GlobalAddressSDNode. // This also does the same as LOADgot, but using a generic i32 load, // while LOADgot only loads i64. SDValue TLSIndexHi = DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); SDValue TLSIndexLo = DAG.getTargetExternalSymbol( "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); SDValue TLSIndex = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); Chain = TLSIndex.getValue(1); // The pointer to the thread's TLS data area is at the TLS Index scaled by 8 // offset into the TLSArray. TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, DAG.getConstant(3, DL, PtrVT)); SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), MachinePointerInfo()); Chain = TLS.getValue(1); const GlobalAddressSDNode *GA = cast(Op); const GlobalValue *GV = GA->getGlobal(); SDValue TGAHi = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue TGALo = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); // Add the offset from the start of the .tls section (section base). SDValue Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, DAG.getTargetConstant(0, DL, MVT::i32)), 0); Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); return Addr; } SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); if (Subtarget->isTargetELF()) return LowerELFGlobalTLSAddress(Op, DAG); if (Subtarget->isTargetWindows()) return LowerWindowsGlobalTLSAddress(Op, DAG); llvm_unreachable("Unexpected platform trying to use TLS"); } SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); MachineFunction &MF = DAG.getMachineFunction(); // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions // will not be produced, as they are conditional branch instructions that do // not set flags. bool ProduceNonFlagSettingCondBr = !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); // Handle f128 first, since lowering it will result in comparing the return // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); // The actual operation with overflow check. AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); if (CC == ISD::SETNE) OFCC = getInvertedCondCode(OFCC); SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Overflow); } if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); // If the RHS of the comparison is zero, we can potentially fold this // to a specialized branch. const ConstantSDNode *RHSC = dyn_cast(RHS); if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { if (CC == ISD::SETEQ) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETNE) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, dl, MVT::i64), Dest); } } if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, dl, MVT::i64), Dest); } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue BR1 = DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, Cmp); } return BR1; } SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); if (SrcVT.bitsLT(VT)) In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); else if (SrcVT.bitsGT(VT)) In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; uint64_t EltMask; SDValue VecVal1, VecVal2; auto setVecVal = [&] (int Idx) { if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } }; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); EltMask = 0x80000000ULL; setVecVal(AArch64::ssub); } else if (VT == MVT::f64 || VT == MVT::v2f64) { VecVT = MVT::v2i64; // We want to materialize a mask with the high bit set, but the AdvSIMD // immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize zero and then negate it. EltMask = 0; setVecVal(AArch64::dsub); } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); EltMask = 0x8000ULL; setVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); // If we couldn't materialize the mask above, then the mask vector will be // the zero vector, and we need to negate it here. if (VT == MVT::f64 || VT == MVT::v2f64) { BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); } SDValue Sel = DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); if (VT == MVT::f16) return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); if (VT == MVT::f32) return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); else if (VT == MVT::f64) return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); else return DAG.getNode(ISD::BITCAST, DL, VT, Sel); } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { if (DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat)) return SDValue(); if (!Subtarget->hasNEON()) return SDValue(); // While there is no integer popcount instruction, it can // be more efficiently lowered to the following sequence that uses // AdvSIMD registers/instructions as long as the copies to/from // the AdvSIMD registers are cheap. // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd // CNT V0.8B, V0.8B // 8xbyte pop-counts // ADDV B0, V0.8B // sum 8xbyte pop-counts // UMOV X0, V0.B[0] // copy byte result back to integer reg SDValue Val = Op.getOperand(0); SDLoc DL(Op); EVT VT = Op.getValueType(); if (VT == MVT::i32 || VT == MVT::i64) { if (VT == MVT::i32) Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; } assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && "Unexpected type for custom ctpop lowering"); EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; Val = DAG.getBitcast(VT8Bit, Val); Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. unsigned EltSize = 8; unsigned NumElts = VT.is64BitVector() ? 8 : 16; while (EltSize != VT.getScalarSizeInBits()) { EltSize *= 2; NumElts /= 2; MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); Val = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); } return Val; } SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc dl(Op); // We chose ZeroOrOneBooleanContents, so use zero and one. EVT VT = Op.getValueType(); SDValue TVal = DAG.getConstant(1, dl, VT); SDValue FVal = DAG.getConstant(0, dl, VT); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); return LHS; } } if (LHS.getValueType().isInteger()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); } else { // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two CSELs to implement. As is in // this case, we emit the first CSEL and then emit a second using the output // of the first as the RHS. We're effectively OR'ing the two CC's together. // FIXME: It would be nice if we could match the two CSELs to two CSINCs. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } } SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Also handle f16, for which we need to do a f32 comparison. if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); } // Next, handle integers. if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in // order to for a CSINV or CSINC out of them. ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } } else if (CTVal && CFVal) { const int64_t TrueVal = CTVal->getSExtValue(); const int64_t FalseVal = CFVal->getSExtValue(); bool Swap = false; // If both TVal and FVal are constants, see if FVal is the // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC // instead of a CSEL in that case. if (TrueVal == ~FalseVal) { Opcode = AArch64ISD::CSINV; } else if (TrueVal == -FalseVal) { Opcode = AArch64ISD::CSNEG; } else if (TVal.getValueType() == MVT::i32) { // If our operands are only 32-bit wide, make sure we use 32-bit // arithmetic for the check whether we can use CSINC. This ensures that // the addition in the check will wrap around properly in case there is // an overflow (which would not be the case if we do the check with // 64-bit arithmetic). const uint32_t TrueVal32 = CTVal->getZExtValue(); const uint32_t FalseVal32 = CFVal->getZExtValue(); if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { Opcode = AArch64ISD::CSINC; if (TrueVal32 > FalseVal32) { Swap = true; } } // 64-bit check whether we can use CSINC. } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { Opcode = AArch64ISD::CSINC; if (TrueVal > FalseVal) { Swap = true; } } // Swap TVal and FVal if necessary. if (Swap) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } if (Opcode != AArch64ISD::CSEL) { // Drop FVal since we can get its value by simply inverting/negating // TVal. FVal = TVal; } } // Avoid materializing a constant when possible by reusing a known value in // a register. However, don't perform this optimization if the known value // is one, zero or negative one in the case of a CSEL. We can always // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the // FVal, respectively. ConstantSDNode *RHSVal = dyn_cast(RHS); if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) { AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to // "a != C ? x : a" to avoid materializing C. if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) TVal = LHS; else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) FVal = LHS; } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { assert (CTVal && CFVal && "Expected constant operands for CSNEG."); // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to // avoid materializing C. AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { Opcode = AArch64ISD::CSINV; TVal = LHS; FVal = DAG.getConstant(0, dl, FVal.getValueType()); } } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); EVT VT = TVal.getValueType(); return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); assert(LHS.getValueType() == RHS.getValueType()); EVT VT = TVal.getValueType(); SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two CSELs to implement. AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); if (DAG.getTarget().Options.UnsafeFPMath) { // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. ConstantFPSDNode *RHSVal = dyn_cast(RHS); if (RHSVal && RHSVal->isZero()) { ConstantFPSDNode *CFVal = dyn_cast(FVal); ConstantFPSDNode *CTVal = dyn_cast(TVal); if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) TVal = LHS; else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && CFVal && CFVal->isZero() && FVal.getValueType() == LHS.getValueType()) FVal = LHS; } } // Emit first, and possibly only, CSEL. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); // If we need a second CSEL, emit it, using the output of the first as the // RHS. We're effectively OR'ing the two CC's together. if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } // Otherwise, return the output of the first CSEL. return CS1; } SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); SDLoc DL(Op); return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); SDLoc DL(Op); // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. if (isOverflowIntrOpRes(CCVal)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) return SDValue(); AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); } // Lower it the same way as we would lower a SELECT_CC node. ISD::CondCode CC; SDValue LHS, RHS; if (CCVal.getOpcode() == ISD::SETCC) { LHS = CCVal.getOperand(0); RHS = CCVal.getOperand(1); CC = cast(CCVal->getOperand(2))->get(); } else { LHS = CCVal; RHS = DAG.getConstant(0, DL, CCVal.getValueType()); CC = ISD::SETNE; } return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); } SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { return getAddrLarge(JT, DAG); } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { return getAddrTiny(JT, DAG); } return getAddr(JT, DAG); } SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. SDLoc DL(Op); SDValue JT = Op.getOperand(1); SDValue Entry = Op.getOperand(2); int JTI = cast(JT.getNode())->getIndex(); SDNode *Dest = DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0), SDValue(Dest, 0)); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. if (Subtarget->isTargetMachO()) { return getGOT(CP, DAG); } return getAddrLarge(CP, DAG); } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { return getAddrTiny(CP, DAG); } else { return getAddr(CP, DAG); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { BlockAddressSDNode *BA = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { return getAddrLarge(BA, DAG); } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { return getAddrTiny(BA, DAG); } return getAddr(BA, DAG); } SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const { AArch64FunctionInfo *FuncInfo = DAG.getMachineFunction().getInfo(); SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const { AArch64FunctionInfo *FuncInfo = DAG.getMachineFunction().getInfo(); SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 ? FuncInfo->getVarArgsGPRIndex() : FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call // Standard, section B.3. MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAList = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); SmallVector MemOps; // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), /* Alignment = */ 8)); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); if (GPRSize > 0) { SDValue GRTop, GRTopAddr; GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, 8), /* Alignment = */ 8)); } // void *__vr_top at offset 16 int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(16, DL, PtrVT)); VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, 16), /* Alignment = */ 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); MemOps.push_back(DAG.getStore( Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, MachinePointerInfo(SV, 24), /* Alignment = */ 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); MemOps.push_back(DAG.getStore( Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, MachinePointerInfo(SV, 28), /* Alignment = */ 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) return LowerWin64_VASTART(Op, DAG); else if (Subtarget->isTargetDarwin()) return LowerDarwin_VASTART(Op, DAG); else return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); unsigned VaListSize = Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), DAG.getConstant(VaListSize, DL, MVT::i32), 8, false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "automatic va_arg instruction only works on Darwin"); const Value *V = cast(Op.getOperand(2))->getValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); if (Align > 8) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, DAG.getConstant(-(int64_t)Align, DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) ArgSize = 8; bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; NeedFPTrunc = true; } // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); // Round the value down to an f32. SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL)); SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; // Merge the rounded value with the chain output of the load. return DAG.getMergeValues(Ops, DL); } return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); EVT VT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); int FI = MFI.CreateFixedObject(4, 0, false); return DAG.getFrameIndex(FI, VT); } #define GET_REGISTER_MATCHER #include "AArch64GenAsmMatcher.inc" // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = MatchRegisterName(RegName); if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); if (!Subtarget->isXRegisterReserved(DwarfRegNum)) Reg = 0; } if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" + StringRef(RegName) + "\".")); } SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const { DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); } SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setReturnAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which // is "undef". We wanted 0, so CSEL it directly. SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETEQ, dl, DAG); SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); HiBitsForLo = DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), HiBitsForLo, CCVal, Cmp); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue LoForNormalShift = DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, dl, DAG); CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue HiForBigShift = Opc == ISD::SRA ? DAG.getNode(Opc, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i64)) : DAG.getConstant(0, dl, VT); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which // is "undef". We wanted 0, so CSEL it directly. SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETEQ, dl, DAG); SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); LoBitsForHi = DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), LoBitsForHi, CCVal, Cmp); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue HiForNormalShift = DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, dl, DAG); CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. SDValue LoForBigShift = DAG.getConstant(0, dl, VT); SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } bool AArch64TargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { // Offsets are folded in the DAG combine rather than here so that we can // intelligently choose an offset based on the uses. return false; } bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool OptForSize) const { bool IsLegal = false; // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and // 16-bit case when target has full fp16 support. // FIXME: We should be able to handle f128 as well with a clever lowering. const APInt ImmInt = Imm.bitcastToAPInt(); if (VT == MVT::f64) IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero(); else if (VT == MVT::f32) IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero(); else if (VT == MVT::f16 && Subtarget->hasFullFP16()) IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero(); // TODO: fmov h0, w0 is also legal, however on't have an isel pattern to // generate that fmov. // If we can not materialize in immediate field for fmov, check if the // value can be encoded as the immediate operand of a logical instruction. // The immediate value will be created with either MOVZ, MOVN, or ORR. if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) { // The cost is actually exactly the same for mov+fmov vs. adrp+ldr; // however the mov+fmov sequence is always better because of the reduced // cache pressure. The timings are still the same if you consider // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the // movw+movk is fused). So we limit up to 2 instrdduction at most. SmallVector Insn; AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn); unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); IsLegal = Insn.size() <= Limit; } LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString() << " imm value: "; Imm.dump();); return IsLegal; } //===----------------------------------------------------------------------===// // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps) { EVT VT = Operand.getValueType(); if (ST->hasNEON() && (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || VT == MVT::v4f32)) { if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) // For the reciprocal estimates, convergence is quadratic, so the number // of digits is doubled after each iteration. In ARMv8, the accuracy of // the initial estimate is 2^-8. Thus the number of extra steps to refine // the result for float (23 mantissa bits) is 2 and for double (52 // mantissa bits) is 3. ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); } return SDValue(); } SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const { if (Enabled == ReciprocalEstimate::Enabled || (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, DAG, ExtraSteps)) { SDLoc DL(Operand); EVT VT = Operand.getValueType(); SDNodeFlags Flags; Flags.setAllowReassociation(true); // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, Flags); Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } if (!Reciprocal) { EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); // Correct the result if the operand is 0.0. Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, Eq, Operand, Estimate); } ExtraSteps = 0; return Estimate; } return SDValue(); } SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps) const { if (Enabled == ReciprocalEstimate::Enabled) if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps)) { SDLoc DL(Operand); EVT VT = Operand.getValueType(); SDNodeFlags Flags; Flags.setAllowReassociation(true); // Newton reciprocal iteration: E * (2 - X * E) // AArch64 reciprocal iteration instruction: (2 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, Flags); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } ExtraSteps = 0; return Estimate; } return SDValue(); } //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// // Table of Constraints // TODO: This is the current set of constraints supported by ARM for the // compiler, not all of them may make sense. // // r - A general register // w - An FP/SIMD register of some size in the range v0-v31 // x - An FP/SIMD register of some size in the range v0-v15 // I - Constant that can be used with an ADD instruction // J - Constant that can be used with a SUB instruction // K - Constant that can be used with a 32-bit logical instruction // L - Constant that can be used with a 64-bit logical instruction // M - Constant that can be used as a 32-bit MOV immediate // N - Constant that can be used as a 64-bit MOV immediate // Q - A memory reference with base register and no offset // S - A symbolic address // Y - Floating point constant zero // Z - Integer constant zero // // Note that general register operands will be output using their 64-bit x // register name, whatever the size of the variable, unless the asm operand // is prefixed by the %w modifier. Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { // At this point, we have to lower this constraint to something else, so we // lower it to an "r" or "w". However, by doing this we will force the result // to be in register, while the X constraint is much more permissive. // // Although we are correct (we are free to emit anything, without // constraints), we might break use cases that would expect us to be more // efficient and emit something else. if (!Subtarget->hasFPARMv8()) return "r"; if (ConstraintVT.isFloatingPoint()) return "w"; if (ConstraintVT.isVector() && (ConstraintVT.getSizeInBits() == 64 || ConstraintVT.getSizeInBits() == 128)) return "w"; return "r"; } /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'x': case 'w': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'Y': case 'Z': return C_Immediate; case 'z': case 'S': // A symbolic address return C_Other; } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight AArch64TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'x': case 'w': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; } return weight; } std::pair AArch64TargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::GPR64commonRegClass); return std::make_pair(0U, &AArch64::GPR32commonRegClass); case 'w': if (!Subtarget->hasFPARMv8()) break; if (VT.getSizeInBits() == 16) return std::make_pair(0U, &AArch64::FPR16RegClass); if (VT.getSizeInBits() == 32) return std::make_pair(0U, &AArch64::FPR32RegClass); if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::FPR64RegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128RegClass); break; // The instructions that this constraint is designed for can // only take 128-bit registers so just use that regclass. case 'x': if (!Subtarget->hasFPARMv8()) break; if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { unsigned Size = Constraint.size(); if ((Size == 4 || Size == 5) && Constraint[0] == '{' && tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. if (VT != MVT::Other && VT.getSizeInBits() == 64) { Res.first = AArch64::FPR64RegClass.getRegister(RegNo); Res.second = &AArch64::FPR64RegClass; } else { Res.first = AArch64::FPR128RegClass.getRegister(RegNo); Res.second = &AArch64::FPR128RegClass; } } } } if (Res.second && !Subtarget->hasFPARMv8() && !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) && !AArch64::GPR64allRegClass.hasSubClassEq(Res.second)) return std::make_pair(0U, nullptr); return Res; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void AArch64TargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { SDValue Result; // Currently only support length 1 constraints. if (Constraint.length() != 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; // This set of constraints deal with valid constants for various instructions. // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) Result = DAG.getRegister(AArch64::XZR, MVT::i64); else Result = DAG.getRegister(AArch64::WZR, MVT::i32); break; } case 'S': { // An absolute symbolic address or label reference. if (const GlobalAddressSDNode *GA = dyn_cast(Op)) { Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), GA->getValueType(0)); } else if (const BlockAddressSDNode *BA = dyn_cast(Op)) { Result = DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); } else if (const ExternalSymbolSDNode *ES = dyn_cast(Op)) { Result = DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0)); } else return; break; } case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': ConstantSDNode *C = dyn_cast(Op); if (!C) return; // Grab the value and do some validation. uint64_t CVal = C->getZExtValue(); switch (ConstraintLetter) { // The I constraint applies only to simple ADD or SUB immediate operands: // i.e. 0 to 4095 with optional shift by 12 // The J constraint applies only to ADD or SUB immediates that would be // valid when negated, i.e. if [an add pattern] were to be output as a SUB // instruction [or vice versa], in other words -1 to -4095 with optional // left shift by 12. case 'I': if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) break; return; case 'J': { uint64_t NVal = -C->getSExtValue(); if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { CVal = C->getSExtValue(); break; } return; } // The K and L constraints apply *only* to logical immediates, including // what used to be the MOVI alias for ORR (though the MOVI alias has now // been removed and MOV should be used). So these constraints have to // distinguish between bit patterns that are valid 32-bit or 64-bit // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice // versa. case 'K': if (AArch64_AM::isLogicalImmediate(CVal, 32)) break; return; case 'L': if (AArch64_AM::isLogicalImmediate(CVal, 64)) break; return; // The M and N constraints are a superset of K and L respectively, for use // with the MOV (immediate) alias. As well as the logical immediates they // also match 32 or 64-bit immediates that can be loaded either using a // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca // (M) or 64-bit 0x1234000000000000 (N) etc. // As a note some of this code is liberally stolen from the asm parser. case 'M': { if (!isUInt<32>(CVal)) return; if (AArch64_AM::isLogicalImmediate(CVal, 32)) break; if ((CVal & 0xFFFF) == CVal) break; if ((CVal & 0xFFFF0000ULL) == CVal) break; uint64_t NCVal = ~(uint32_t)CVal; if ((NCVal & 0xFFFFULL) == NCVal) break; if ((NCVal & 0xFFFF0000ULL) == NCVal) break; return; } case 'N': { if (AArch64_AM::isLogicalImmediate(CVal, 64)) break; if ((CVal & 0xFFFFULL) == CVal) break; if ((CVal & 0xFFFF0000ULL) == CVal) break; if ((CVal & 0xFFFF00000000ULL) == CVal) break; if ((CVal & 0xFFFF000000000000ULL) == CVal) break; uint64_t NCVal = ~CVal; if ((NCVal & 0xFFFFULL) == NCVal) break; if ((NCVal & 0xFFFF0000ULL) == NCVal) break; if ((NCVal & 0xFFFF00000000ULL) == NCVal) break; if ((NCVal & 0xFFFF000000000000ULL) == NCVal) break; return; } default: return; } // All assembler immediates are 64-bit integers. Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); break; } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } //===----------------------------------------------------------------------===// // AArch64 Advanced SIMD Support //===----------------------------------------------------------------------===// /// WidenVector - Given a value in the V64 register class, produce the /// equivalent value in the V128 register class. static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { EVT VT = V64Reg.getValueType(); unsigned NarrowSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); SDLoc DL(V64Reg); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), V64Reg, DAG.getConstant(0, DL, MVT::i32)); } /// getExtFactor - Determine the adjustment factor for the position when /// generating an "extract from vector registers" instruction. static unsigned getExtFactor(SDValue &V) { EVT EltType = V.getValueType().getVectorElementType(); return EltType.getSizeInBits() / 8; } /// NarrowVector - Given a value in the V128 register class, produce the /// equivalent value in the V64 register class. static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { EVT VT = V128Reg.getValueType(); unsigned WideSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); SDLoc DL(V128Reg); return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); } // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { SDValue Vec; unsigned MinElt; unsigned MaxElt; // We may insert some combination of BITCASTs and VEXT nodes to force Vec to // be compatible with the shuffle we intend to construct. As a result // ShuffleVec will be some sliding window into the original Vec. SDValue ShuffleVec; // Code should guarantee that element i in Vec starts at element "WindowBase // + i * WindowScale in ShuffleVec". int WindowBase; int WindowScale; ShuffleSourceInfo(SDValue Vec) : Vec(Vec), MinElt(std::numeric_limits::max()), MaxElt(0), ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } }; // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(V.getOperand(1))) { LLVM_DEBUG( dbgs() << "Reshuffle failed: " "a shuffle can only come from building a vector from " "various elements of other vectors, provided their " "indices are constant\n"); return SDValue(); } // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); auto Source = find(Sources, SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } if (Sources.size() > 2) { LLVM_DEBUG( dbgs() << "Reshuffle failed: currently only do something sane when at " "most two source vectors are involved\n"); return SDValue(); } // Find out the smallest element size among result and two sources, and use // it as element size to build the shuffle_vector. EVT SmallestEltTy = VT.getVectorElementType(); for (auto &Source : Sources) { EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); if (SrcEltTy.bitsLT(SmallestEltTy)) { SmallestEltTy = SrcEltTy; } } unsigned ResMultiplier = VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); // If the source vector is too wide or too narrow, we may nevertheless be able // to construct a compatible shuffle either by concatenating it with UNDEF or // extracting a suitable range of elements. for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); // We can pad out the smaller vector for free, so if it's part of a // shuffle... Src.ShuffleVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { LLVM_DEBUG( dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); return SDValue(); } if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i64)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i64)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i64)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Imm, dl, MVT::i32)); Src.WindowBase = -Src.MinElt; } } // Another possible incompatibility occurs from the vector element types. We // can fix this by bitcasting the source vectors to the same type we intend // for the shuffle. for (auto &Src : Sources) { EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } // Final sanity check before we try to actually produce a shuffle. LLVM_DEBUG(for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT);); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.isUndef()) continue; auto Src = find(Sources, Entry.getOperand(0)); int EltNo = cast(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, // starting at the appropriate offset. int *LaneMask = &Mask[i * ResMultiplier]; int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; ExtractBase += NumElts * (Src - Sources.begin()); for (int j = 0; j < LanesDefined; ++j) LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... if (!isShuffleMaskLegal(Mask, ShuffleVT)) { LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); return SDValue(); } SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], Mask); SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); dbgs() << "Reshuffle, creating node: "; V.dump();); return V; } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are the same. static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, just follow it // back to index zero and keep going. ++ExpectedElt; if (ExpectedElt == NumElts) ExpectedElt = 0; if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } return true; } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are different. static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, unsigned &Imm) { // Look for the first non-undef element. const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); // Benefit form APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); // The following shuffle indices must be the successive elements after the // first real element. const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); if (FirstWrongElt != M.end()) return false; // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. The EXT index should be the expected // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. Imm = ExpectedElt.getZExtValue(); // There are two difference cases requiring to reverse input vectors. // For example, for vector <4 x i32> we have the following cases, // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) // For both cases, we finally use mask <5, 6, 7, 0>, which requires // to reverse two input vectors. if (Imm < NumElts) ReverseEXT = true; else Imm -= NumElts; return true; } /// isREVMask - Check if a vector shuffle corresponds to a REV /// instruction with the specified blocksize. (The order of the elements /// within each block of the vector is reversed.) static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && "Only possible block sizes for REV are: 16, 32, 64"); unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; // If the first shuffle index is UNDEF, be optimistic. if (M[0] < 0) BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) return false; } return true; } static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) return false; Idx += 1; } return true; } static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i != NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != 2 * i + WhichResult) return false; } return true; } static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); if (NumElts % 2 != 0) return false; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) return false; } return true; } /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); if (NumElts % 2 != 0) return false; WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) return false; Idx += 1; } return true; } /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned Half = VT.getVectorNumElements() / 2; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned j = 0; j != 2; ++j) { unsigned Idx = WhichResult; for (unsigned i = 0; i != Half; ++i) { int MIdx = M[i + j * Half]; if (MIdx >= 0 && (unsigned)MIdx != Idx) return false; Idx += 2; } } return true; } /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); if (NumElts % 2 != 0) return false; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) return false; } return true; } static bool isINSMask(ArrayRef M, int NumInputElements, bool &DstIsLeft, int &Anomaly) { if (M.size() != static_cast(NumInputElements)) return false; int NumLHSMatch = 0, NumRHSMatch = 0; int LastLHSMismatch = -1, LastRHSMismatch = -1; for (int i = 0; i < NumInputElements; ++i) { if (M[i] == -1) { ++NumLHSMatch; ++NumRHSMatch; continue; } if (M[i] == i) ++NumLHSMatch; else LastLHSMismatch = i; if (M[i] == i + NumInputElements) ++NumRHSMatch; else LastRHSMismatch = i; } if (NumLHSMatch == NumInputElements - 1) { DstIsLeft = true; Anomaly = LastLHSMismatch; return true; } else if (NumRHSMatch == NumInputElements - 1) { DstIsLeft = false; Anomaly = LastRHSMismatch; return true; } return false; } static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { if (VT.getSizeInBits() != 128) return false; unsigned NumElts = VT.getVectorNumElements(); for (int I = 0, E = NumElts / 2; I != E; I++) { if (Mask[I] != I) return false; } int Offset = NumElts / 2; for (int I = NumElts / 2, E = NumElts; I != E; I++) { if (Mask[I] != I + SplitLHS * Offset) return false; } return true; } static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue V0 = Op.getOperand(0); SDValue V1 = Op.getOperand(1); ArrayRef Mask = cast(Op)->getMask(); if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || VT.getVectorElementType() != V1.getValueType().getVectorElementType()) return SDValue(); bool SplitV0 = V0.getValueSizeInBits() == 128; if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), VT.getVectorNumElements() / 2); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); } if (V1.getValueSizeInBits() == 128) { V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, DAG.getConstant(0, DL, MVT::i64)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VREV, OP_VDUP0, OP_VDUP1, OP_VDUP2, OP_VDUP3, OP_VEXT1, OP_VEXT2, OP_VEXT3, OP_VUZPL, // VUZP, left result OP_VUZPR, // VUZP, right result OP_VZIPL, // VZIP, left result OP_VZIPR, // VZIP, right result OP_VTRNL, // VTRN, left result OP_VTRNR // VTRN, right result }; if (OpNum == OP_COPY) { if (LHSID == (1 * 9 + 2) * 9 + 3) return LHS; assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. if (VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::f32) return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::f16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: case OP_VDUP3: { EVT EltTy = VT.getVectorElementType(); unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; else if (EltTy == MVT::i16 || EltTy == MVT::f16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; else if (EltTy == MVT::i64 || EltTy == MVT::f64) Opcode = AArch64ISD::DUPLANE64; else llvm_unreachable("Invalid vector element type?"); if (VT.getSizeInBits() == 64) OpLHS = WidenVector(OpLHS, DAG); SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); } case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: { unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, DAG.getConstant(Imm, dl, MVT::i32)); } case OP_VUZPL: return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VUZPR: return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VZIPL: return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VZIPR: return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VTRNL: return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VTRNR: return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); } } static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { // Check to see if we can use the TBL instruction. SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc DL(Op); EVT EltVT = Op.getValueType().getVectorElementType(); unsigned BytesPerElt = EltVT.getSizeInBits() / 8; SmallVector TBLMask; for (int Val : ShuffleMask) { for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); } } MVT IndexVT = MVT::v8i8; unsigned IndexLen = 8; if (Op.getValueSizeInBits() == 128) { IndexVT = MVT::v16i8; IndexLen = 16; } SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; if (V2.getNode()->isUndef()) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getBuildVector(IndexVT, DL, makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getBuildVector(IndexVT, DL, makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, V2Cst, DAG.getBuildVector(IndexVT, DL, makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); } static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; if (EltType == MVT::i16 || EltType == MVT::f16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; if (EltType == MVT::i64 || EltType == MVT::f64) return AArch64ISD::DUPLANE64; llvm_unreachable("Invalid vector element type?"); } SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again // during code selection. This is more efficient and avoids the possibility // of inconsistencies between legalization and selection. ArrayRef ShuffleMask = SVN->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. if (Lane == -1) Lane = 0; if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- // constant. If so, we can just reference the lane's definition directly. if (V1.getOpcode() == ISD::BUILD_VECTOR && !isa(V1.getOperand(Lane))) return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); // Otherwise, duplicate from the lane of the input vector. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); // SelectionDAGBuilder may have "helpfully" already extracted or conatenated // to make a vector of the same size as this SHUFFLE. We can ignore the // extract entirely, and canonicalise the concat using WidenVector. if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { Lane += cast(V1.getOperand(1))->getZExtValue(); V1 = V1.getOperand(0); } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; Lane -= Idx * VT.getVectorNumElements() / 2; V1 = WidenVector(V1.getOperand(Idx), DAG); } else if (VT.getSizeInBits() == 64) V1 = WidenVector(V1, DAG); return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64)); } if (isREVMask(ShuffleMask, VT, 64)) return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 32)) return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 16)) return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); bool ReverseEXT = false; unsigned Imm; if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { if (ReverseEXT) std::swap(V1, V2); Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } unsigned WhichResult; if (isZIPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isUZPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isTRNMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) return Concat; bool DstIsLeft; int Anomaly; int NumInputElements = V1.getValueType().getVectorNumElements(); if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { SDValue DstVec = DstIsLeft ? V1 : V2; SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); SDValue SrcVec = V1; int SrcLane = ShuffleMask[Anomaly]; if (SrcLane >= NumInputElements) { SrcVec = V2; SrcLane -= VT.getVectorNumElements(); } SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); EVT ScalarVT = VT.getVectorElementType(); if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) ScalarVT = MVT::i32; return DAG.getNode( ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), DstLaneV); } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = ShuffleMask[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } return GenerateTBL(Op, ShuffleMask, DAG); } static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; for (unsigned i = 0; i < NumSplats; ++i) { CnstBits <<= SplatBitSize; UndefBits <<= SplatBitSize; CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); } return true; } return false; } // Try 64-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; if (AArch64_AM::isAdvSIMDModImmType10(Value)) { Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 32-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS = nullptr) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); Shift = 0; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); Shift = 8; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); Shift = 16; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); Shift = 24; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov; if (LHS) Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); else Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 16-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS = nullptr) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); Shift = 0; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); Shift = 8; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov; if (LHS) Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); else Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 32-bit splatted SIMD immediate with shifted ones. static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); Shift = 264; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); Shift = 272; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 8-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; if (AArch64_AM::isAdvSIMDModImmType9(Value)) { Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try FP splatted SIMD immediate. static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); bool isWide = (VT.getSizeInBits() == 128); MVT MovTy; bool isAdvSIMDModImm = false; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType11(Value); MovTy = isWide ? MVT::v4f32 : MVT::v2f32; } else if (isWide && (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType12(Value); MovTy = MVT::v2f64; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Specialized code to quickly find if PotentialBVec is a BuildVector that // consists of only the same constant int value, returned in reference arg // ConstVal static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal) { BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); if (!Bvec) return false; ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); if (!FirstElt) return false; EVT VT = Bvec->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 1; i < NumElts; ++i) if (dyn_cast(Bvec->getOperand(i)) != FirstElt) return false; ConstVal = FirstElt->getZExtValue(); return true; } static unsigned getIntrinsicID(const SDNode *N) { unsigned Opcode = N->getOpcode(); switch (Opcode) { default: return Intrinsic::not_intrinsic; case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(N->getOperand(0))->getZExtValue(); if (IID < Intrinsic::num_intrinsics) return IID; return Intrinsic::not_intrinsic; } } } // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a // BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. // Also, logical shift right -> sri, with the same structure. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); SDLoc DL(N); // Is the first op an AND? const SDValue And = N->getOperand(0); if (And.getOpcode() != ISD::AND) return SDValue(); // Is the second op an shl or lshr? SDValue Shift = N->getOperand(1); // This will have been turned into: AArch64ISD::VSHL vector, #shift // or AArch64ISD::VLSHR vector, #shift unsigned ShiftOpc = Shift.getOpcode(); if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) return SDValue(); bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; // Is the shift amount constant? ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); if (!C2node) return SDValue(); // Is the and mask vector all constant? uint64_t C1; if (!isAllConstantBuildVector(And.getOperand(1), C1)) return SDValue(); // Is C1 == ~C2, taking into account how much one can shift elements of a // particular size? uint64_t C2 = C2node->getZExtValue(); unsigned ElemSizeInBits = VT.getScalarSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); unsigned ElemMask = (1 << ElemSizeInBits) - 1; if ((C1 & ElemMask) != (~C2 & ElemMask)) return SDValue(); SDValue X = And.getOperand(0); SDValue Y = Shift.getOperand(0); unsigned Intrin = IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; SDValue ResultSLI = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrin, DL, MVT::i32), X, Y, Shift.getOperand(1)); LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); LLVM_DEBUG(N->dump(&DAG)); LLVM_DEBUG(dbgs() << "into: \n"); LLVM_DEBUG(ResultSLI->dump(&DAG)); ++NumShiftInserts; return ResultSLI; } SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; } EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getOperand(1).getNode()); if (!BVN) { // OR commutes, so try swapping the operands. LHS = Op.getOperand(1); BVN = dyn_cast(Op.getOperand(0).getNode()); } if (!BVN) return Op; APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS))) return NewOp; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS))) return NewOp; } // We can always fall back to a non-immediate OR. return Op; } // Normalize the operands of BUILD_VECTOR. The value of constant operands will // be truncated to fit element width. static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT EltTy= VT.getVectorElementType(); if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) return Op; SmallVector Ops; for (SDValue Lane : Op->ops()) { // For integer vectors, type legalization would have promoted the // operands already. Otherwise, if Op is a floating-point splat // (with operands cast to integers), then the only possibilities // are constants and UNDEFs. if (auto *CstLane = dyn_cast(Lane)) { APInt LowBits(EltTy.getSizeInBits(), CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } else if (Lane.getNode()->isUndef()) { Lane = DAG.getUNDEF(MVT::i32); } else { assert(Lane.getValueType() == MVT::i32 && "Unexpected BUILD_VECTOR operand type"); } Ops.push_back(Lane); } return DAG.getBuildVector(VT, dl, Ops); } static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); BuildVectorSDNode *BVN = cast(Op.getNode()); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) return NewOp; DefBits = ~DefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) return NewOp; DefBits = UndefBits; if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) return NewOp; DefBits = ~UndefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) return NewOp; } return SDValue(); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // Try to build a simple constant vector. Op = NormalizeBuildVector(Op, DAG); if (VT.isInteger()) { // Certain vector constants, used to express things like logical NOT and // arithmetic NEG, are passed through unmodified. This allows special // patterns for these operations to match, which will lower these constants // to whatever is proven necessary. BuildVectorSDNode *BVN = cast(Op.getNode()); if (BVN->isConstant()) if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { unsigned BitSize = VT.getVectorElementType().getSizeInBits(); APInt Val(BitSize, Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); if (Val.isNullValue() || Val.isAllOnesValue()) return Op; } } if (SDValue V = ConstantBuildVector(Op, DAG)) return V; // Scan through the operands to find some interesting properties we can // exploit: // 1) If only one value is used, we can use a DUP, or // 2) if only the low element is not undef, we can just insert that, or // 3) if only one constant value is used (w/ some non-constant lanes), // we can splat the constant value into the whole vector then fill // in the non-constant lanes. // 4) FIXME: If different constant values are used, but we can intelligently // select the values we'll be overwriting for the non-constant // lanes such that we can directly materialize the vector // some other way (MOVI, e.g.), we can be sneaky. // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. SDLoc dl(Op); unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; bool usesOnlyOneConstantValue = true; bool isConstant = true; bool AllLanesExtractElt = true; unsigned NumConstantLanes = 0; SDValue Value; SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) AllLanesExtractElt = false; if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; if (!isa(V) && !isa(V)) isConstant = false; if (isa(V) || isa(V)) { ++NumConstantLanes; if (!ConstantValue.getNode()) ConstantValue = V; else if (ConstantValue != V) usesOnlyOneConstantValue = false; } if (!Value.getNode()) Value = V; else if (V != Value) usesOnlyOneValue = false; } if (!Value.getNode()) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); return DAG.getUNDEF(VT); } // Convert BUILD_VECTOR where all elements but the lowest are undef into // SCALAR_TO_VECTOR, except for when we have a single-element constant vector // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. if (isOnlyLowElement && !(NumElts == 1 && isa(Value))) { LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " "SCALAR_TO_VECTOR node\n"); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); } if (AllLanesExtractElt) { SDNode *Vector = nullptr; bool Even = false; bool Odd = false; // Check whether the extract elements match the Even pattern <0,2,4,...> or // the Odd pattern <1,3,5,...>. for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); const SDNode *N = V.getNode(); if (!isa(N->getOperand(1))) break; SDValue N0 = N->getOperand(0); // All elements are extracted from the same vector. if (!Vector) { Vector = N0.getNode(); // Check that the type of EXTRACT_VECTOR_ELT matches the type of // BUILD_VECTOR. if (VT.getVectorElementType() != N0.getValueType().getVectorElementType()) break; } else if (Vector != N0.getNode()) { Odd = false; Even = false; break; } // Extracted values are either at Even indices <0,2,4,...> or at Odd // indices <1,3,5,...>. uint64_t Val = N->getConstantOperandVal(1); if (Val == 2 * i) { Even = true; continue; } if (Val - 1 == 2 * i) { Odd = true; continue; } // Something does not match: abort. Odd = false; Even = false; break; } if (Even || Odd) { SDValue LHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), DAG.getConstant(0, dl, MVT::i64)); SDValue RHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), DAG.getConstant(NumElts, dl, MVT::i64)); if (Even && !Odd) return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, RHS); if (Odd && !Even) return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, RHS); } } // Use DUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. if (usesOnlyOneValue) { if (!isConstant) { if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Value.getValueType() != VT) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); } // This is actually a DUPLANExx operation, which keeps everything vectory. SDValue Lane = Value.getOperand(1); Value = Value.getOperand(0); if (Value.getValueSizeInBits() == 64) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " "widening it\n"); Value = WidenVector(Value, DAG); } unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); return DAG.getNode(Opcode, dl, VT, Value, Lane); } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; EVT EltTy = VT.getVectorElementType(); assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && "Unsupported floating-point vector type"); LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " "BITCASTS, and try again\n"); MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; Val.dump();); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } } // If there was only one constant value used and for more than one lane, // start by splatting that value, then replace the non-constant lanes. This // is better than the default, which will perform a separate initialization // for each lane. if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { // Firstly, try to materialize the splat constant. SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), Val = ConstantBuildVector(Vec, DAG); if (!Val) { // Otherwise, materialize the constant and splat it. Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); DAG.ReplaceAllUsesWith(Vec.getNode(), &Val); } // Now insert the non-constant lanes. for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); if (!isa(V) && !isa(V)) // Note that type legalization likely mucked about with the VT of the // source operand, so we may have to convert it here before inserting. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); } return Val; } // This will generate a load from the constant pool. if (isConstant) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " "expansion\n"); return SDValue(); } // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's // scalar_to_vector for the elements followed by a shuffle (provided the // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " "of INSERT_VECTOR_ELT\n"); SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); unsigned i = 0; // Use SCALAR_TO_VECTOR for lane zero to // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register, and we're forced to emit an // INSERT_SUBREG that we can't fold anywhere. // // We also allow types like i8 and i16 which are illegal scalar but legal // vector element types. After type-legalization the inserted value is // extended (i32) and it is safe to cast them to the vector type by ignoring // the upper bits of the lowest lane (e.g. v8i8, v4i16). if (!Op0.isUndef()) { LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); ++i; } LLVM_DEBUG(if (i < NumElts) dbgs() << "Creating nodes for the other vector elements:\n";); for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " "better alternative\n"); return SDValue(); } SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast(Op.getOperand(2)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform insertion by expanding the value // to a V128 type and perform the insertion on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, Op.getOperand(1), Op.getOperand(2)); // Re-narrow the resultant vector. return NarrowVector(Node, DAG); } SDValue AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast(Op.getOperand(1)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform extraction by expanding the value // to a V128 type and perform the extraction on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); EVT ExtrTy = WideTy.getVectorElementType(); if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) ExtrTy = MVT::i32; // For extractions, we just return the result directly. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getOperand(0).getValueType(); SDLoc dl(Op); // Just in case... if (!VT.isVector()) return SDValue(); ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); if (!Cst) return SDValue(); unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueSizeInBits(); // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. if (Val == 0) return Op; // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getScalarSizeInBits() == 64) return Op; return SDValue(); } bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (M[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = M[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return true; } bool DummyBool; int DummyInt; unsigned DummyUnsigned; return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || isEXTMask(M, VT, DummyBool, DummyUnsigned) || // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || isZIPMask(M, VT, DummyUnsigned) || isTRN_v_undef_Mask(M, VT, DummyUnsigned) || isUZP_v_undef_Mask(M, VT, DummyUnsigned) || isZIP_v_undef_Mask(M, VT, DummyUnsigned) || isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || isConcatMask(M, VT, VT.getSizeInBits() == 128)); } /// getVShiftImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, ElementBits) || SplatBitSize > ElementBits) return false; Cnt = SplatBits.getSExtValue(); return true; } /// isVShiftLImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift left operation. That value must be in the range: /// 0 <= Value < ElementBits for a left shift; or /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getScalarSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift right operation. The value must be in the range: /// 1 <= Value <= ElementBits for a right shift; or static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getScalarSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); int64_t Cnt; if (!Op.getOperand(1).getValueType().isVector()) return Op; unsigned EltSize = VT.getScalarSizeInBits(); switch (Op.getOpcode()) { default: llvm_unreachable("unexpected shift opcode"); case ISD::SHL: if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: // Right shift immediate if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); } // Right shift register. Note, there is not a shift right register // instruction, but the shift left register instruction takes a signed // value, where negative numbers specify a right shift. unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl : Intrinsic::aarch64_neon_ushl; // negate the shift amount SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); SDValue NegShiftLeft = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), NegShift); return NegShiftLeft; } return SDValue(); } static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); bool IsZero = IsCnst && (CnstBits == 0); if (SrcVT.getVectorElementType().isFloatingPoint()) { switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Fcmeq; if (IsZero) Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); else Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); case AArch64CC::LS: if (IsZero) return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); case AArch64CC::LT: if (!NoNans) return SDValue(); // If we ignore NaNs then we can use to the MI implementation. LLVM_FALLTHROUGH; case AArch64CC::MI: if (IsZero) return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); } } switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Cmeq; if (IsZero) Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); else Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); case AArch64CC::LE: if (IsZero) return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); case AArch64CC::LS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); case AArch64CC::LO: return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); case AArch64CC::LT: if (IsZero) return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); case AArch64CC::HI: return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); case AArch64CC::HS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); } } SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); SDLoc dl(Op); if (LHS.getValueType().getVectorElementType().isInteger()) { assert(LHS.getValueType() == RHS.getValueType()); AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); SDValue Cmp = EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } const bool FullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); // Make v4f16 (only) fcmp operations utilise vector instructions // v8f16 support will be a litle more complicated if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) { if (LHS.getValueType().getVectorNumElements() == 4) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS); SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC); DAG.ReplaceAllUsesWith(Op, NewSetcc); CmpVT = MVT::v4i32; } else return SDValue(); } assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) || LHS.getValueType().getVectorElementType() != MVT::f128); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. AArch64CC::CondCode CC1, CC2; bool ShouldInvert; changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) return SDValue(); if (CC2 != AArch64CC::AL) { SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); if (!Cmp2.getNode()) return SDValue(); Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); } Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); if (ShouldInvert) Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); return Cmp; } static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG) { SDValue VecOp = ScalarOp.getOperand(0); auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, DAG.getConstant(0, DL, MVT::i64)); } SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); switch (Op.getOpcode()) { case ISD::VECREDUCE_ADD: return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); case ISD::VECREDUCE_SMAX: return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); case ISD::VECREDUCE_SMIN: return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); case ISD::VECREDUCE_UMAX: return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); case ISD::VECREDUCE_UMIN: return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); case ISD::VECREDUCE_FMAX: { assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), Op.getOperand(0)); } case ISD::VECREDUCE_FMIN: { assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), Op.getOperand(0)); } default: llvm_unreachable("Unhandled reduction"); } } SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = static_cast(DAG.getSubtarget()); if (!Subtarget.hasLSE()) return SDValue(); // LSE has an atomic load-add instruction, but not a load-sub. SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue RHS = Op.getOperand(2); AtomicSDNode *AN = cast(Op.getNode()); RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(), Op.getOperand(0), Op.getOperand(1), RHS, AN->getMemOperand()); } SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = static_cast(DAG.getSubtarget()); if (!Subtarget.hasLSE()) return SDValue(); // LSE has an atomic load-clear instruction, but not a load-and. SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue RHS = Op.getOperand(2); AtomicSDNode *AN = cast(Op.getNode()); RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), Op.getOperand(0), Op.getOperand(1), RHS, AN->getMemOperand()); } SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0); const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); if (Subtarget->hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); Chain = DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), DAG.getRegisterMask(Mask), Chain.getValue(1)); // To match the actual intent better, we should read the output from X15 here // again (instead of potentially spilling it to the stack), but rereading Size // from X15 here doesn't work at -O0, since it thinks that X15 is undefined // here. Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); return Chain; } SDValue AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "Only Windows alloca probing supported"); SDLoc dl(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); unsigned Align = cast(Op.getOperand(2))->getZExtValue(); EVT VT = Node->getValueType(0); if (DAG.getMachineFunction().getFunction().hasFnAttribute( "no-stack-arg-probe")) { SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); } Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); } /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. unsigned NumElts = 0; for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_ldaxp: case Intrinsic::aarch64_ldxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 16; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = 16; Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; default: break; } return false; } bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // TODO: This may be worth removing. Check regression tests for diffs. if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) return false; // If we're reducing the load width in order to avoid having to use an extra // instruction to do extension then it's probably a good idea. if (ExtTy != ISD::NON_EXTLOAD) return true; // Don't reduce load width if it would prevent us from combining a shift into // the offset. MemSDNode *Mem = dyn_cast(Load); assert(Mem); const SDValue &Base = Mem->getBasePtr(); if (Base.getOpcode() == ISD::ADD && Base.getOperand(1).getOpcode() == ISD::SHL && Base.getOperand(1).hasOneUse() && Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { // The shift can be combined if it matches the size of the value being // loaded (and so reducing the width would make it not match). uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8; if (ShiftAmount == Log2_32(LoadBytes)) return false; } // We have no reason to disallow reducing the load width, so allow it. return true; } // Truncations from 64-bit GPR to 32-bit GPR is free. bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } /// Check if it is profitable to hoist instruction in then/else to if. /// Not profitable if I and it's user can form a FMA instruction /// because we prefer FMSUB/FMADD. bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { if (I->getOpcode() != Instruction::FMul) return true; if (!I->hasOneUse()) return true; Instruction *User = I->user_back(); if (User && !(User->getOpcode() == Instruction::FSub || User->getOpcode() == Instruction::FAdd)) return true; const TargetOptions &Options = getTargetMachine().Options; const DataLayout &DL = I->getModule()->getDataLayout(); EVT VT = getValueType(DL, User->getOperand(0)->getType()); return !(isFMAFasterThanFMulAndFAdd(VT) && isOperationLegalOrCustom(ISD::FMA, VT) && (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding // 64-bit GPR. bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) { return true; } if (Val.getOpcode() != ISD::LOAD) return false; // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && VT1.getSizeInBits() <= 32); } bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { if (isa(Ext)) return false; // Vector types are not free. if (Ext->getType()->isVectorTy()) return false; for (const Use &U : Ext->uses()) { // The extension is free if we can fold it with a left shift in an // addressing mode or an arithmetic operation: add, sub, and cmp. // Is there a shift? const Instruction *Instr = cast(U.getUser()); // Is this a constant shift? switch (Instr->getOpcode()) { case Instruction::Shl: if (!isa(Instr->getOperand(1))) return false; break; case Instruction::GetElementPtr: { gep_type_iterator GTI = gep_type_begin(Instr); auto &DL = Ext->getModule()->getDataLayout(); std::advance(GTI, U.getOperandNo()-1); Type *IdxTy = GTI.getIndexedType(); // This extension will end up with a shift because of the scaling factor. // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) return false; break; } case Instruction::Trunc: // Check if this is a noop. // trunc(sext ty1 to ty2) to ty1. if (Instr->getType() == Ext->getOperand(0)->getType()) continue; LLVM_FALLTHROUGH; default: return false; } // At this point we can use the bfm family, so this extension is free // for that use. } return true; } /// Check if both Op1 and Op2 are shufflevector extracts of either the lower /// or upper half of the vector elements. static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { auto areTypesHalfed = [](Value *FullV, Value *HalfV) { auto *FullVT = cast(FullV->getType()); auto *HalfVT = cast(HalfV->getType()); return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth(); }; auto extractHalf = [](Value *FullV, Value *HalfV) { auto *FullVT = cast(FullV->getType()); auto *HalfVT = cast(HalfV->getType()); return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); }; Constant *M1, *M2; Value *S1Op1, *S2Op1; if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) || !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2)))) return false; // Check that the operands are half as wide as the result and we extract // half of the elements of the input vectors. if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) || !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2)) return false; // Check the mask extracts either the lower or upper half of vector // elements. int M1Start = -1; int M2Start = -1; int NumElements = cast(Op1->getType())->getNumElements() * 2; if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) return false; return true; } /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth /// of the vector elements. static bool areExtractExts(Value *Ext1, Value *Ext2) { auto areExtDoubled = [](Instruction *Ext) { return Ext->getType()->getScalarSizeInBits() == 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); }; if (!match(Ext1, m_ZExtOrSExt(m_Value())) || !match(Ext2, m_ZExtOrSExt(m_Value())) || !areExtDoubled(cast(Ext1)) || !areExtDoubled(cast(Ext2))) return false; return true; } /// Check if sinking \p I's operands to I's basic block is profitable, because /// the operands can be folded into a target instruction, e.g. /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). bool AArch64TargetLowering::shouldSinkOperands( Instruction *I, SmallVectorImpl &Ops) const { if (!I->getType()->isVectorTy()) return false; if (IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { case Intrinsic::aarch64_neon_umull: if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) return false; Ops.push_back(&II->getOperandUse(0)); Ops.push_back(&II->getOperandUse(1)); return true; default: return false; } } switch (I->getOpcode()) { case Instruction::Sub: case Instruction::Add: { if (!areExtractExts(I->getOperand(0), I->getOperand(1))) return false; // If the exts' operands extract either the lower or upper elements, we // can sink them too. auto Ext1 = cast(I->getOperand(0)); auto Ext2 = cast(I->getOperand(1)); if (areExtractShuffleVectors(Ext1, Ext2)) { Ops.push_back(&Ext1->getOperandUse(0)); Ops.push_back(&Ext2->getOperandUse(0)); } Ops.push_back(&I->getOperandUse(0)); Ops.push_back(&I->getOperandUse(1)); return true; } default: return false; } return false; } bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const { if (!LoadedType.isSimple() || (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) return false; // Cyclone supports unaligned accesses. RequiredAligment = 0; unsigned NumBits = LoadedType.getSizeInBits(); return NumBits == 32 || NumBits == 64; } /// A helper function for determining the number of interleaved accesses we /// will generate when lowering accesses of the given type. unsigned AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const { return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } MachineMemOperand::Flags AArch64TargetLowering::getMMOFlags(const Instruction &I) const { if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) return MOStridedAccess; return MachineMemOperand::MONone; } bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); // Ensure the number of vector elements is greater than 1. if (VecTy->getNumElements() < 2) return false; // Ensure the element type is legal. if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) return false; // Ensure the total vector size is 64 or a multiple of 128. Types larger than // 128 will be split into multiple interleaved accesses. return VecSize == 64 || VecSize % 128 == 0; } /// Lower an interleaved load into a ldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements /// /// Into: /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isPointerTy()) VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); IRBuilder<> Builder(LI); // The base address of the load. Value *BaseAddr = LI->getPointerOperand(); if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. VecTy = VectorType::get(VecTy->getVectorElementType(), VecTy->getVectorNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( BaseAddr, VecTy->getVectorElementType()->getPointerTo( LI->getPointerAddressSpace())); } Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[2] = {VecTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; Function *LdNFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); // Holds sub-vectors extracted from the load intrinsic return values. The // sub-vectors are associated with the shufflevector instructions they will // replace. DenseMap> SubVecs; for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) BaseAddr = Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, VecTy->getVectorNumElements() * Factor); CallInst *LdN = Builder.CreateCall( LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); // Extract and store the sub-vectors returned by the load intrinsic. for (unsigned i = 0; i < Shuffles.size(); i++) { ShuffleVectorInst *SVI = Shuffles[i]; unsigned Index = Indices[i]; Value *SubVec = Builder.CreateExtractValue(LdN, Index); // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( SubVec, VectorType::get(SVI->getType()->getVectorElementType(), VecTy->getVectorNumElements())); SubVecs[SVI].push_back(SubVec); } } // Replace uses of the shufflevector instructions with the sub-vectors // returned by the load intrinsic. If a shufflevector instruction is // associated with more than one sub-vector, those sub-vectors will be // concatenated into a single wide vector. for (ShuffleVectorInst *SVI : Shuffles) { auto &SubVec = SubVecs[SVI]; auto *WideVec = SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; SVI->replaceAllUsesWith(WideVec); } return true; } /// Lower an interleaved store into a stN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) /// /// Note that the new shufflevectors will be removed and we'll only generate one /// st3 instruction in CodeGen. /// /// Example for a more general valid mask (Factor 3). Lower: /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); VectorType *VecTy = SVI->getType(); assert(VecTy->getVectorNumElements() % Factor == 0 && "Invalid interleaved store"); unsigned LaneLen = VecTy->getVectorNumElements() / Factor; Type *EltTy = VecTy->getVectorElementType(); VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); unsigned NumOpElts = Op0->getType()->getVectorNumElements(); // Convert to the corresponding integer vector. Type *IntVecTy = VectorType::get(IntTy, NumOpElts); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); SubVecTy = VectorType::get(IntTy, LaneLen); } // The base address of the store. Value *BaseAddr = SI->getPointerOperand(); if (NumStores > 1) { // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( SI->getPointerAddressSpace())); } auto Mask = SVI->getShuffleMask(); Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); Type *Tys[2] = {SubVecTy, PtrTy}; static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, Intrinsic::aarch64_neon_st3, Intrinsic::aarch64_neon_st4}; Function *StNFunc = Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { SmallVector Ops; // Split the shufflevector operands into sub vectors for the new stN call. for (unsigned i = 0; i < Factor; i++) { unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Ops.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { unsigned IdxJ = StoreCount * LaneLen * Factor + j; if (Mask[IdxJ * Factor + IdxI] >= 0) { StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; break; } } // Note: Filling undef gaps with random elements is ok, since // those elements were being written anyway (with undefs). // In the case of all undefs we're defaulting to using elems from 0 // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Ops.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } } // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), BaseAddr, LaneLen * Factor); Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); Builder.CreateCall(StNFunc, Ops); } return true; } static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, unsigned AlignCheck) { return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && (DstAlign == 0 || DstAlign % AlignCheck == 0)); } EVT AArch64TargetLowering::getOptimalMemOpType( uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const { bool CanImplicitFloat = !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; // Only use AdvSIMD to implement memset of 32-byte and above. It would have // taken one instruction to materialize the v2i64 zero and one store (with // restrictive addressing mode). Just do i64 stores. bool IsSmallMemset = IsMemset && Size < 32; auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) return true; bool Fast; return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, &Fast) && Fast; }; if (CanUseNEON && IsMemset && !IsSmallMemset && AlignmentIsAcceptable(MVT::v2i64, 16)) return MVT::v2i64; if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) return MVT::f128; if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) return MVT::i64; if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) return MVT::i32; return MVT::Other; } // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits::min()) { LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed << ": avoid UB for INT64_MIN\n"); return false; } // Same encoding for add/sub, just flip the sign. Immed = std::abs(Immed); bool IsLegal = ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); LLVM_DEBUG(dbgs() << "Is " << Immed << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n"); return IsLegal; } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { return isLegalAddImmediate(Immed); } /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // AArch64 has five basic addressing modes: // reg // reg + 9-bit signed offset // reg + SIZE_IN_BYTES * 12-bit unsigned offset // reg1 + reg2 // reg + SIZE_IN_BYTES * reg // No global is ever allowed as a base. if (AM.BaseGV) return false; // No reg+reg+imm addressing. if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) return false; // check reg + imm case: // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; if (Ty->isSized()) { uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; } if (!AM.Scale) { int64_t Offset = AM.BaseOffs; // 9-bit signed offset if (isInt<9>(Offset)) return true; // 12-bit unsigned offset unsigned shift = Log2_64(NumBytes); if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && // Must be a multiple of NumBytes (NumBytes is a power of 2) (Offset >> shift) << shift == Offset) return true; return false; } // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { // Consider splitting large offset of struct or array. return true; } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // Operands | Rt Latency // ------------------------------------------- // Rt, [Xn, Xm] | 4 // ------------------------------------------- // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 // Rt, [Xn, Wm, #imm] | if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 if // it is not equal to 0 or 1. return AM.Scale != 0 && AM.Scale != 1; return -1; } bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } const MCPhysReg * AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call // site. Hence we include LR in the scratch registers, which are in turn added // as implicit-defs for stackmaps and patchpoints. static const MCPhysReg ScratchRegs[] = { AArch64::X16, AArch64::X17, AArch64::LR, 0 }; return ScratchRegs; } bool AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const { N = N->getOperand(0).getNode(); EVT VT = N->getValueType(0); // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine // it with shift to let it be lowered to UBFX. if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && isa(N->getOperand(1))) { uint64_t TruncMask = N->getConstantOperandVal(1); if (isMask_64(TruncMask) && N->getOperand(0).getOpcode() == ISD::SRL && isa(N->getOperand(0)->getOperand(1))) return false; } return true; } bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0) return false; int64_t Val = Imm.getSExtValue(); if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) return true; if ((int64_t)Val < 0) Val = ~Val; if (BitSize == 32) Val &= (1LL << 32) - 1; unsigned LZ = countLeadingZeros((uint64_t)Val); unsigned Shift = (63 - LZ) / 16; // MOVZ is free so return true for one or fewer MOVK. return Shift < 3; } bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; return (Index == 0 || Index == ResVT.getVectorNumElements()); } /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: /// cmge X, X, #0 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!Subtarget->hasNEON() || !VT.isVector()) return SDValue(); // There must be a shift right algebraic before the xor, and the xor must be a // 'not' operation. SDValue Shift = N->getOperand(0); SDValue Ones = N->getOperand(1); if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || !ISD::isBuildVectorAllOnes(Ones.getNode())) return SDValue(); // The shift should be smearing the sign bit across each vector element. auto *ShiftAmt = dyn_cast(Shift.getOperand(1)); EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) return SDValue(); return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); } // Generate SUBS and CSEL for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CSEL. if (VT.isInteger() && N->getOpcode() == ISD::XOR && N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0.getOperand(0)); // Generate SUBS & CSEL. SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), N0.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, DAG.getConstant(AArch64CC::PL, DL, MVT::i32), SDValue(Cmp.getNode(), 1)); } return SDValue(); } static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; return performIntegerAbsCombine(N, DAG); } SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); unsigned Lg2 = Divisor.countTrailingZeros(); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); // Add (N0 < 0) ? Pow2 - 1 : 0; SDValue CCVal; SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); Created.push_back(Cmp.getNode()); Created.push_back(Add.getNode()); Created.push_back(CSel.getNode()); // Divide by pow2. SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. if (Divisor.isNonNegative()) return SRA; Created.push_back(SRA.getNode()); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); // The below optimizations require a constant RHS. if (!isa(N->getOperand(1))) return SDValue(); ConstantSDNode *C = cast(N->getOperand(1)); const APInt &ConstValue = C->getAPIntValue(); // Multiplication of a power of two plus/minus one can be done more // cheaply as as shift+add/sub. For now, this is true unilaterally. If // future CPUs have a cheaper MADD instruction, this may need to be // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. // More aggressively, some multiplications N0 * C can be lowered to // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, // e.g. 6=3*2=(2+1)*2. // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45 // which equals to (1+2)*16-(1+2). SDValue N0 = N->getOperand(0); // TrailingZeroes is used to test if the mul can be lowered to // shift+add+shift. unsigned TrailingZeroes = ConstValue.countTrailingZeros(); if (TrailingZeroes) { // Conservatively do not lower to shift+add+shift if the mul might be // folded into smul or umul. if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) || isZeroExtended(N0.getNode(), DAG))) return SDValue(); // Conservatively do not lower to shift+add+shift if the mul might be // folded into madd or msub. if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD || N->use_begin()->getOpcode() == ISD::SUB)) return SDValue(); } // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub // and shift+add+shift. APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); unsigned ShiftAmt, AddSubOpc; // Is the shifted value the LHS operand of the add/sub? bool ShiftValUseIsN0 = true; // Do we need to negate the result? bool NegateResult = false; if (ConstValue.isNonNegative()) { // (mul x, 2^N + 1) => (add (shl x, N), x) // (mul x, 2^N - 1) => (sub (shl x, N), x) // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) APInt SCVMinus1 = ShiftedConstValue - 1; APInt CVPlus1 = ConstValue + 1; if (SCVMinus1.isPowerOf2()) { ShiftAmt = SCVMinus1.logBase2(); AddSubOpc = ISD::ADD; } else if (CVPlus1.isPowerOf2()) { ShiftAmt = CVPlus1.logBase2(); AddSubOpc = ISD::SUB; } else return SDValue(); } else { // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) // (mul x, -(2^N + 1)) => - (add (shl x, N), x) APInt CVNegPlus1 = -ConstValue + 1; APInt CVNegMinus1 = -ConstValue - 1; if (CVNegPlus1.isPowerOf2()) { ShiftAmt = CVNegPlus1.logBase2(); AddSubOpc = ISD::SUB; ShiftValUseIsN0 = false; } else if (CVNegMinus1.isPowerOf2()) { ShiftAmt = CVNegMinus1.logBase2(); AddSubOpc = ISD::ADD; NegateResult = true; } else return SDValue(); } SDLoc DL(N); EVT VT = N->getValueType(0); SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShiftAmt, DL, MVT::i64)); SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0; SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal; SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1); assert(!(NegateResult && TrailingZeroes) && "NegateResult and TrailingZeroes cannot both be true for now."); // Negate the result. if (NegateResult) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res); // Shift the result. if (TrailingZeroes) return DAG.getNode(ISD::SHL, DL, VT, Res, DAG.getConstant(TrailingZeroes, DL, MVT::i64)); return Res; } static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast(N->getOperand(0)->getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); return Res; } return SDValue(); } static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); if (VT != MVT::f32 && VT != MVT::f64) return SDValue(); // Only optimize when the source and destination types have the same width. if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) return SDValue(); // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile()) { LoadSDNode *LN0 = cast(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), LN0->getPointerInfo(), LN0->getAlignment(), LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them // to use the new Chain. DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; return DAG.getNode(Opcode, SDLoc(N), VT, Load); } return SDValue(); } /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); if (!N->getValueType(0).isSimple()) return SDValue(); SDValue Op = N->getOperand(0); if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); uint32_t FloatBits = FloatTy.getSizeInBits(); if (FloatBits != 32 && FloatBits != 64) return SDValue(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); if (IntBits != 16 && IntBits != 32 && IntBits != 64) return SDValue(); // Avoid conversions where iN is larger than the float (e.g., float -> i64). if (IntBits > FloatBits) return SDValue(); BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t Bits = IntBits == 64 ? 64 : 32; int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); if (C == -1 || C == 0 || C > Bits) return SDValue(); MVT ResTy; unsigned NumLanes = Op.getValueType().getVectorNumElements(); switch (NumLanes) { default: return SDValue(); case 2: ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) return SDValue(); assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && "Illegal vector type after legalization"); SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs : Intrinsic::aarch64_neon_vcvtfp2fxu; SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); // We can handle smaller integers by generating an extra trunc. if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); return FixConv; } /// Fold a floating-point divide by power of two into fixed-point to /// floating-point conversion. static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); unsigned Opc = Op->getOpcode(); if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || !Op.getOperand(0).getValueType().isSimple() || (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) return SDValue(); SDValue ConstVec = N->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); int32_t IntBits = IntTy.getSizeInBits(); if (IntBits != 16 && IntBits != 32 && IntBits != 64) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); int32_t FloatBits = FloatTy.getSizeInBits(); if (FloatBits != 32 && FloatBits != 64) return SDValue(); // Avoid conversions where iN is larger than the float (e.g., i64 -> float). if (IntBits > FloatBits) return SDValue(); BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); if (C == -1 || C == 0 || C > FloatBits) return SDValue(); MVT ResTy; unsigned NumLanes = Op.getValueType().getVectorNumElements(); switch (NumLanes) { default: return SDValue(); case 2: ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) return SDValue(); SDLoc DL(N); SDValue ConvInput = Op.getOperand(0); bool IsSigned = Opc == ISD::SINT_TO_FP; if (IntBits < FloatBits) ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, ResTy, ConvInput); unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp : Intrinsic::aarch64_neon_vcvtfxu2fp; return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, DAG.getConstant(C, DL, MVT::i32)); } /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, bool &FromHi) { if (N.getOpcode() == ISD::SHL) FromHi = false; else if (N.getOpcode() == ISD::SRL) FromHi = true; else return false; if (!isa(N.getOperand(1))) return false; ShiftAmount = N->getConstantOperandVal(1); Src = N->getOperand(0); return true; } /// EXTR instruction extracts a contiguous chunk of bits from two existing /// registers viewed as a high/low pair. This function looks for the pattern: /// (or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N)) and replaces it /// with an EXTR. Can't quite be done in TableGen because the two immediates /// aren't independent. static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); EVT VT = N->getValueType(0); assert(N->getOpcode() == ISD::OR && "Unexpected root"); if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); SDValue LHS; uint32_t ShiftLHS = 0; bool LHSFromHi = false; if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) return SDValue(); SDValue RHS; uint32_t ShiftRHS = 0; bool RHSFromHi = false; if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) return SDValue(); // If they're both trying to come from the high part of the register, they're // not really an EXTR. if (LHSFromHi == RHSFromHi) return SDValue(); if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) return SDValue(); if (LHSFromHi) { std::swap(LHS, RHS); std::swap(ShiftLHS, ShiftRHS); } return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, DAG.getConstant(ShiftRHS, DL, MVT::i64)); } static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); if (!VT.isVector()) return SDValue(); SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::AND) return SDValue(); SDValue N1 = N->getOperand(1); if (N1.getOpcode() != ISD::AND) return SDValue(); // We only have to look for constant vectors here since the general, variable // case can be handled in TableGen. unsigned Bits = VT.getScalarSizeInBits(); uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); for (int i = 1; i >= 0; --i) for (int j = 1; j >= 0; --j) { BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); if (!BVN0 || !BVN1) continue; bool FoundMatch = true; for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); if (!CN0 || !CN1 || CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { FoundMatch = false; break; } } if (FoundMatch) return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } return SDValue(); } static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; if (SDValue Res = tryCombineToBSL(N, DCI)) return Res; return SDValue(); } static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue LHS = N->getOperand(0); EVT VT = N->getValueType(0); if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1).getNode()); if (!BVN) return SDValue(); // AND does not accept an immediate, so check if we can use a BIC immediate // instruction instead. We do this here instead of using a (and x, (mvni imm)) // pattern in isel, because some immediates may be lowered to the preferred // (and x, (movi imm)) form, even though an mvni representation also exists. APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; DefBits = ~DefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, DefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, DefBits, &LHS))) return NewOp; UndefBits = ~UndefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, UndefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, UndefBits, &LHS))) return NewOp; } return SDValue(); } static SDValue performSRLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. SDValue N0 = N->getOperand(0); if (N0.getOpcode() == ISD::BSWAP) { SDLoc DL(N); SDValue N1 = N->getOperand(1); SDValue N00 = N0.getOperand(0); if (ConstantSDNode *C = dyn_cast(N1)) { uint64_t ShiftAmt = C->getZExtValue(); if (VT == MVT::i32 && ShiftAmt == 16 && DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); if (VT == MVT::i64 && ShiftAmt == 32 && DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); } } return SDValue(); } static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Remove extraneous bitcasts around an extract_subvector. // For example, // (v4i16 (bitconvert // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) // becomes // (extract_subvector ((v8i16 ...), (i64 4))) // Only interested in 64-bit vectors as the ultimate result. EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); if (VT.getSimpleVT().getSizeInBits() != 64) return SDValue(); // Is the operand an extract_subvector starting at the beginning or halfway // point of the vector? A low half may also come through as an // EXTRACT_SUBREG, so look for that, too. SDValue Op0 = N->getOperand(0); if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && !(Op0->isMachineOpcode() && Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) return SDValue(); uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) return SDValue(); } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { if (idx != AArch64::dsub) return SDValue(); // The dsub reference is equivalent to a lane zero subvector reference. idx = 0; } // Look through the bitcast of the input to the extract. if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) return SDValue(); SDValue Source = Op0->getOperand(0)->getOperand(0); // If the source type has twice the number of elements as our destination // type, we know this is an extract of the high or low half of the vector. EVT SVT = Source->getValueType(0); if (!SVT.isVector() || SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) return SDValue(); LLVM_DEBUG( dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); // Create the simplified form to just extract the low or high half of the // vector directly rather than bothering with the bitcasts. SDLoc dl(N); unsigned NumElements = VT.getVectorNumElements(); if (idx) { SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); } else { SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, Source, SubReg), 0); } } static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), // (v2i16 (truncate (v2i64))))) // -> // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), // (v4i32 (bitcast (v2i64))), // <0, 2, 4, 6>))) // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed // on both input and result type, so we might generate worse code. // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. if (N->getNumOperands() == 2 && N0->getOpcode() == ISD::TRUNCATE && N1->getOpcode() == ISD::TRUNCATE) { SDValue N00 = N0->getOperand(0); SDValue N10 = N1->getOperand(0); EVT N00VT = N00.getValueType(); if (N00VT == N10.getValueType() && (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); SmallVector Mask(MidVT.getVectorNumElements()); for (size_t i = 0; i < Mask.size(); ++i) Mask[i] = i * 2; return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getVectorShuffle( MidVT, dl, DAG.getNode(ISD::BITCAST, dl, MidVT, N00), DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); } } // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. if (N0 == N1 && VT.getVectorNumElements() == 2) { assert(VT.getScalarSizeInBits() == 64); return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), DAG.getConstant(0, dl, MVT::i64)); } // Canonicalise concat_vectors so that the right-hand vector has as few // bit-casts as possible before its real operation. The primary matching // destination for these operations will be the narrowing "2" instructions, // which depend on the operation being performed on this right-hand vector. // For example, // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) if (N1->getOpcode() != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); // If the RHS is not a vector, this is not the pattern we're looking for. if (!RHSTy.isVector()) return SDValue(); LLVM_DEBUG( dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), RHSTy.getVectorNumElements() * 2); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), RHS)); } static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait until after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Transform a scalar conversion of a value from a lane extract into a // lane extract of a vector conversion. E.g., from foo1 to foo2: // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } // // The second form interacts better with instruction selection and the // register allocator to avoid cross-class register copies that aren't // coalescable due to a lane reference. // Check the operand and see if it originates from a lane extract. SDValue Op1 = N->getOperand(1); if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { // Yep, no additional predication needed. Perform the transform. SDValue IID = N->getOperand(0); SDValue Shift = N->getOperand(2); SDValue Vec = Op1.getOperand(0); SDValue Lane = Op1.getOperand(1); EVT ResTy = N->getValueType(0); EVT VecResTy; SDLoc DL(N); // The vector width should be 128 bits by the time we get here, even // if it started as 64 bits (the extract_vector handling will have // done so). assert(Vec.getValueSizeInBits() == 128 && "unexpected vector size on extract_vector_elt!"); if (Vec.getValueType() == MVT::v4i32) VecResTy = MVT::v4f32; else if (Vec.getValueType() == MVT::v2i64) VecResTy = MVT::v2f64; else llvm_unreachable("unexpected vector type!"); SDValue Convert = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); } return SDValue(); } // AArch64 high-vector "long" operations are formed by performing the non-high // version on an extract_subvector of each operand which gets the high half: // // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) // // However, there are cases which don't have an extract_high explicitly, but // have another operation that can be made compatible with one for free. For // example: // // (dupv64 scalar) --> (extract_high (dup128 scalar)) // // This routine does the actual conversion of such DUPs, once outer routines // have determined that everything else is in order. // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold // similarly here. static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { switch (N.getOpcode()) { case AArch64ISD::DUP: case AArch64ISD::DUPLANE8: case AArch64ISD::DUPLANE16: case AArch64ISD::DUPLANE32: case AArch64ISD::DUPLANE64: case AArch64ISD::MOVI: case AArch64ISD::MOVIshift: case AArch64ISD::MOVIedit: case AArch64ISD::MOVImsl: case AArch64ISD::MVNIshift: case AArch64ISD::MVNImsl: break; default: // FMOV could be supported, but isn't very useful, as it would only occur // if you passed a bitcast' floating point immediate to an eligible long // integer op (addl, smull, ...). return SDValue(); } MVT NarrowTy = N.getSimpleValueType(); if (!NarrowTy.is64BitVector()) return SDValue(); MVT ElementTy = NarrowTy.getVectorElementType(); unsigned NumElems = NarrowTy.getVectorNumElements(); MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); SDLoc dl(N); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), DAG.getConstant(NumElems, dl, MVT::i64)); } static bool isEssentiallyExtractHighSubvector(SDValue N) { if (N.getOpcode() == ISD::BITCAST) N = N.getOperand(0); if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR) return false; return cast(N.getOperand(1))->getAPIntValue() == N.getOperand(0).getValueType().getVectorNumElements() / 2; } /// Helper structure to keep track of ISD::SET_CC operands. struct GenericSetCCInfo { const SDValue *Opnd0; const SDValue *Opnd1; ISD::CondCode CC; }; /// Helper structure to keep track of a SET_CC lowered into AArch64 code. struct AArch64SetCCInfo { const SDValue *Cmp; AArch64CC::CondCode CC; }; /// Helper structure to keep track of SetCC information. union SetCCInfo { GenericSetCCInfo Generic; AArch64SetCCInfo AArch64; }; /// Helper structure to be able to read SetCC information. If set to /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a /// GenericSetCCInfo. struct SetCCInfoAndKind { SetCCInfo Info; bool IsAArch64; }; /// Check whether or not \p Op is a SET_CC operation, either a generic or /// an /// AArch64 lowered one. /// \p SetCCInfo is filled accordingly. /// \post SetCCInfo is meanginfull only when this function returns true. /// \return True when Op is a kind of SET_CC operation. static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { // If this is a setcc, this is straight forward. if (Op.getOpcode() == ISD::SETCC) { SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); SetCCInfo.IsAArch64 = false; return true; } // Otherwise, check if this is a matching csel instruction. // In other words: // - csel 1, 0, cc // - csel 0, 1, !cc if (Op.getOpcode() != AArch64ISD::CSEL) return false; // Set the information about the operands. // TODO: we want the operands of the Cmp not the csel SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); SetCCInfo.IsAArch64 = true; SetCCInfo.Info.AArch64.CC = static_cast( cast(Op.getOperand(2))->getZExtValue()); // Check that the operands matches the constraints: // (1) Both operands must be constants. // (2) One must be 1 and the other must be 0. ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); // Check (1). if (!TValue || !FValue) return false; // Check (2). if (!TValue->isOne()) { // Update the comparison when we are interested in !cc. std::swap(TValue, FValue); SetCCInfo.Info.AArch64.CC = AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); } return TValue->isOne() && FValue->isNullValue(); } // Returns true if Op is setcc or zext of setcc. static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { if (isSetCC(Op, Info)) return true; return ((Op.getOpcode() == ISD::ZERO_EXTEND) && isSetCC(Op->getOperand(0), Info)); } // The folding we want to perform is: // (add x, [zext] (setcc cc ...) ) // --> // (csel x, (add x, 1), !cc ...) // // The latter will get matched to a CSINC instruction. static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); SDValue LHS = Op->getOperand(0); SDValue RHS = Op->getOperand(1); SetCCInfoAndKind InfoAndKind; // If neither operand is a SET_CC, give up. if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { std::swap(LHS, RHS); if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) return SDValue(); } // FIXME: This could be generatized to work for FP comparisons. EVT CmpVT = InfoAndKind.IsAArch64 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() : InfoAndKind.Info.Generic.Opnd0->getValueType(); if (CmpVT != MVT::i32 && CmpVT != MVT::i64) return SDValue(); SDValue CCVal; SDValue Cmp; SDLoc dl(Op); if (InfoAndKind.IsAArch64) { CCVal = DAG.getConstant( AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, MVT::i32); Cmp = *InfoAndKind.Info.AArch64.Cmp; } else Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1, ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), CCVal, DAG, dl); EVT VT = Op->getValueType(0); LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); } // The basic add/sub long vector instructions have variants with "2" on the end // which act on the high-half of their inputs. They are normally matched by // patterns like: // // (add (zeroext (extract_high LHS)), // (zeroext (extract_high RHS))) // -> uaddl2 vD, vN, vM // // However, if one of the extracts is something like a duplicate, this // instruction can still be used profitably. This function puts the DAG into a // more appropriate form for those patterns to trigger. static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT VT = N->getSimpleValueType(0); if (!VT.is128BitVector()) { if (N->getOpcode() == ISD::ADD) return performSetccAddFolding(N, DAG); return SDValue(); } // Make sure both branches are extended in the same way. SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if ((LHS.getOpcode() != ISD::ZERO_EXTEND && LHS.getOpcode() != ISD::SIGN_EXTEND) || LHS.getOpcode() != RHS.getOpcode()) return SDValue(); unsigned ExtType = LHS.getOpcode(); // It's not worth doing if at least one of the inputs isn't already an // extract, but we don't know which it'll be so we have to try both. if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) { RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); if (!RHS.getNode()) return SDValue(); RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) { LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); if (!LHS.getNode()) return SDValue(); LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); } return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); } // Massage DAGs which we can use the high-half "long" operations on into // something isel will recognize better. E.g. // // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> // (aarch64_neon_umull (extract_high (v2i64 vec))) // (extract_high (v2i64 (dup128 scalar))))) // static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); assert(LHS.getValueType().is64BitVector() && RHS.getValueType().is64BitVector() && "unexpected shape for long operation"); // Either node could be a DUP, but it's not worth doing both of them (you'd // just as well use the non-high version) so look for a corresponding extract // operation on the other "wing". if (isEssentiallyExtractHighSubvector(LHS)) { RHS = tryExtendDUPToExtractHigh(RHS, DAG); if (!RHS.getNode()) return SDValue(); } else if (isEssentiallyExtractHighSubvector(RHS)) { LHS = tryExtendDUPToExtractHigh(LHS, DAG); if (!LHS.getNode()) return SDValue(); } return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), N->getOperand(0), LHS, RHS); } static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { MVT ElemTy = N->getSimpleValueType(0).getScalarType(); unsigned ElemBits = ElemTy.getSizeInBits(); int64_t ShiftAmount; if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ElemBits) || SplatBitSize != ElemBits) return SDValue(); ShiftAmount = SplatValue.getSExtValue(); } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { ShiftAmount = CVN->getSExtValue(); } else return SDValue(); unsigned Opcode; bool IsRightShift; switch (IID) { default: llvm_unreachable("Unknown shift intrinsic"); case Intrinsic::aarch64_neon_sqshl: Opcode = AArch64ISD::SQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_uqshl: Opcode = AArch64ISD::UQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_srshl: Opcode = AArch64ISD::SRSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_urshl: Opcode = AArch64ISD::URSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_sqshlu: Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { SDLoc dl(N); return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), DAG.getConstant(-ShiftAmount, dl, MVT::i32)); } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { SDLoc dl(N); return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), DAG.getConstant(ShiftAmount, dl, MVT::i32)); } return SDValue(); } // The CRC32[BH] instructions ignore the high bits of their data operand. Since // the intrinsics must be legal and take an i32, this means there's almost // certainly going to be a zext in the DAG which we can eliminate. static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { SDValue AndN = N->getOperand(2); if (AndN.getOpcode() != ISD::AND) return SDValue(); ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); if (!CMask || CMask->getZExtValue() != Mask) return SDValue(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); } static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), DAG.getNode(Opc, dl, N->getOperand(1).getSimpleValueType(), N->getOperand(1)), DAG.getConstant(0, dl, MVT::i64)); } static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; unsigned IID = getIntrinsicID(N); switch (IID) { default: break; case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); case Intrinsic::aarch64_neon_sminv: return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); case Intrinsic::aarch64_neon_uminv: return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); case Intrinsic::aarch64_neon_smaxv: return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmaxnm: return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fminnm: return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: case Intrinsic::aarch64_neon_pmull: case Intrinsic::aarch64_neon_sqdmull: return tryCombineLongOpWithDup(IID, N, DCI, DAG); case Intrinsic::aarch64_neon_sqshl: case Intrinsic::aarch64_neon_uqshl: case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: return tryCombineCRC32(0xff, N, DAG); case Intrinsic::aarch64_crc32h: case Intrinsic::aarch64_crc32ch: return tryCombineCRC32(0xffff, N, DAG); } return SDValue(); } static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then // we can convert that DUP into another extract_high (of a bigger DUP), which // helps the backend to decide that an sabdl2 would be useful, saving a real // extract_high operation. if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { SDNode *ABDNode = N->getOperand(0).getNode(); unsigned IID = getIntrinsicID(ABDNode); if (IID == Intrinsic::aarch64_neon_sabd || IID == Intrinsic::aarch64_neon_uabd) { SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); if (!NewABD.getNode()) return SDValue(); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } } // This is effectively a custom type legalization for AArch64. // // Type legalization will split an extend of a small, legal, type to a larger // illegal type by first splitting the destination type, often creating // illegal source types, which then get legalized in isel-confusing ways, // leading to really terrible codegen. E.g., // %result = v8i32 sext v8i8 %value // becomes // %losrc = extract_subreg %value, ... // %hisrc = extract_subreg %value, ... // %lo = v4i32 sext v4i8 %losrc // %hi = v4i32 sext v4i8 %hisrc // Things go rapidly downhill from there. // // For AArch64, the [sz]ext vector instructions can only go up one element // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 // take two instructions. // // This implies that the most efficient way to do the extend from v8i8 // to two v4i32 values is to first extend the v8i8 to v8i16, then do // the normal splitting to happen for the v8i16->v8i32. // This is pre-legalization to catch some cases where the default // type legalization will create ill-tempered code. if (!DCI.isBeforeLegalizeOps()) return SDValue(); // We're only interested in cleaning things up for non-legal vector types // here. If both the source and destination are legal, things will just // work naturally without any fiddling. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT ResVT = N->getValueType(0); if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) return SDValue(); // If the vector type isn't a simple VT, it's beyond the scope of what // we're worried about here. Let legalization do its thing and hope for // the best. SDValue Src = N->getOperand(0); EVT SrcVT = Src->getValueType(0); if (!ResVT.isSimple() || !SrcVT.isSimple()) return SDValue(); // If the source VT is a 64-bit vector, we can play games and get the // better results we want. if (SrcVT.getSizeInBits() != 64) return SDValue(); unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); unsigned ElementCount = SrcVT.getVectorNumElements(); SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); SDLoc DL(N); Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); // Now split the rest of the operation into two halves, each with a 64 // bit source. EVT LoVT, HiVT; SDValue Lo, Hi; unsigned NumElements = ResVT.getVectorNumElements(); assert(!(NumElements & 1) && "Splitting vector, but not in half!"); LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), NumElements / 2); EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(0, DL, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); // Now combine the parts back together so we still have a single result // like the combiner expects. return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts) { assert(!St.isTruncatingStore() && "cannot split truncating vector store"); unsigned OrigAlignment = St.getAlignment(); unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; // Create scalar stores. This is at least as good as the code sequence for a // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(&St); SDValue BasePtr = St.getBasePtr(); uint64_t BaseOffset = 0; const MachinePointerInfo &PtrInfo = St.getPointerInfo(); SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, OrigAlignment, St.getMemOperand()->getFlags()); // As this in ISel, we will not merge this add which may degrade results. if (BasePtr->getOpcode() == ISD::ADD && isa(BasePtr->getOperand(1))) { BaseOffset = cast(BasePtr->getOperand(1))->getSExtValue(); BasePtr = BasePtr->getOperand(0); } unsigned Offset = EltOffset; while (--NumVecElts) { unsigned Alignment = MinAlign(OrigAlignment, Offset); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, PtrInfo.getWithOffset(Offset), Alignment, St.getMemOperand()->getFlags()); Offset += EltOffset; } return NewST1; } /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The /// load store optimizer pass will merge them to store pair stores. This should /// be better than a movi to create the vector zero followed by a vector store /// if the zero constant is not re-used, since one instructions and one register /// live range will be removed. /// /// For example, the final generated code should be: /// /// stp xzr, xzr, [x0] /// /// instead of: /// /// movi v0.2d, #0 /// str q0, [x0] /// static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or // 2, 3 or 4 i32 elements. int NumVecElts = VT.getVectorNumElements(); if (!(((NumVecElts == 2 || NumVecElts == 3) && VT.getVectorElementType().getSizeInBits() == 64) || ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && VT.getVectorElementType().getSizeInBits() == 32))) return SDValue(); if (StVal.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // If the zero constant has more than one use then the vector store could be // better since the constant mov will be amortized and stp q instructions // should be able to be formed. if (!StVal.hasOneUse()) return SDValue(); // If the store is truncating then it's going down to i16 or smaller, which // means it can be implemented in a single store anyway. if (St.isTruncatingStore()) return SDValue(); // If the immediate offset of the address operand is too large for the stp // instruction, then bail out. if (DAG.isBaseWithConstantOffset(St.getBasePtr())) { int64_t Offset = St.getBasePtr()->getConstantOperandVal(1); if (Offset < -512 || Offset > 504) return SDValue(); } for (int I = 0; I < NumVecElts; ++I) { SDValue EltVal = StVal.getOperand(I); if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal)) return SDValue(); } // Use a CopyFromReg WZR/XZR here to prevent // DAGCombiner::MergeConsecutiveStores from undoing this transformation. SDLoc DL(&St); unsigned ZeroReg; EVT ZeroVT; if (VT.getVectorElementType().getSizeInBits() == 32) { ZeroReg = AArch64::WZR; ZeroVT = MVT::i32; } else { ZeroReg = AArch64::XZR; ZeroVT = MVT::i64; } SDValue SplatVal = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT); return splitStoreSplat(DAG, St, SplatVal, NumVecElts); } /// Replace a splat of a scalar to a vector store by scalar stores of the scalar /// value. The load store optimizer pass will merge them to store pair stores. /// This has better performance than a splat of the scalar followed by a split /// vector store. Even if the stores are not merged it is four stores vs a dup, /// followed by an ext.b and two stores. static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); // Don't replace floating point stores, they possibly won't be transformed to // stp because of the store pair suppress pass. if (VT.isFloatingPoint()) return SDValue(); // We can express a splat as store pair(s) for 2 or 4 elements. unsigned NumVecElts = VT.getVectorNumElements(); if (NumVecElts != 4 && NumVecElts != 2) return SDValue(); // If the store is truncating then it's going down to i16 or smaller, which // means it can be implemented in a single store anyway. if (St.isTruncatingStore()) return SDValue(); // Check that this is a splat. // Make sure that each of the relevant vector element locations are inserted // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); SDValue SplatVal; for (unsigned I = 0; I < NumVecElts; ++I) { // Check for insert vector elements. if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); // Check that same value is inserted at each vector element. if (I == 0) SplatVal = StVal.getOperand(1); else if (StVal.getOperand(1) != SplatVal) return SDValue(); // Check insert element index. ConstantSDNode *CIndex = dyn_cast(StVal.getOperand(2)); if (!CIndex) return SDValue(); uint64_t IndexVal = CIndex->getZExtValue(); if (IndexVal >= NumVecElts) return SDValue(); IndexNotInserted.reset(IndexVal); StVal = StVal.getOperand(0); } // Check that all vector element locations were inserted to. if (IndexNotInserted.any()) return SDValue(); return splitStoreSplat(DAG, St, SplatVal, NumVecElts); } static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { StoreSDNode *S = cast(N); if (S->isVolatile() || S->isIndexed()) return SDValue(); SDValue StVal = S->getValue(); EVT VT = StVal.getValueType(); if (!VT.isVector()) return SDValue(); // If we get a splat of zeros, convert this vector store to a store of // scalars. They will be merged into store pairs of xzr thereby removing one // instruction and one register. if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) return ReplacedZeroSplat; // FIXME: The logic for deciding if an unaligned store should be split should // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. if (!Subtarget->isMisaligned128StoreSlow()) return SDValue(); // Don't split at -Oz. if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); // Don't split v2i64 vectors. Memcpy lowering produces those and splitting // those up regresses performance on micro-benchmarks and olden/bh. if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) return SDValue(); // Split unaligned 16B stores. They are terrible for performance. // Don't split stores with alignment of 1 or 2. Code that uses clang vector // extensions can use this to mark that it does not want splitting to happen // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of // eliminating alignment hazards is only 1 in 8 for alignment of 2. if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || S->getAlignment() <= 2) return SDValue(); // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S)) return ReplacedSplat; SDLoc DL(S); unsigned NumElts = VT.getVectorNumElements() / 2; // Split VT into two. EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(0, DL, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(NumElts, DL, MVT::i64)); SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), S->getAlignment(), S->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, S->getPointerInfo(), S->getAlignment(), S->getMemOperand()->getFlags()); } /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); unsigned LoadIdx = IsLaneOp ? 1 : 0; SDNode *LD = N->getOperand(LoadIdx).getNode(); // If it is not LOAD, can not do such combine. if (LD->getOpcode() != ISD::LOAD) return SDValue(); // The vector lane must be a constant in the LD1LANE opcode. SDValue Lane; if (IsLaneOp) { Lane = N->getOperand(2); auto *LaneC = dyn_cast(Lane); if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); } LoadSDNode *LoadSDN = cast(LD); EVT MemVT = LoadSDN->getMemoryVT(); // Check if memory operand is the same type as the vector element. if (MemVT != VT.getVectorElementType()) return SDValue(); // Check if there are other uses. If so, do not combine as it will introduce // an extra load. for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. continue; if (*UI != N) return SDValue(); } SDValue Addr = LD->getOperand(1); SDValue Vector = N->getOperand(0); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = VT.getScalarSizeInBits() / 8; if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } // To avoid cycle construction make sure that neither the load nor the add // are predecessors to each other or the Vector. SmallPtrSet Visited; SmallVector Worklist; - Visited.insert(N); + Visited.insert(Addr.getNode()); Worklist.push_back(User); Worklist.push_back(LD); Worklist.push_back(Vector.getNode()); if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) || SDNode::hasPredecessorHelper(User, Visited, Worklist)) continue; SmallVector Ops; Ops.push_back(LD->getOperand(0)); // Chain if (IsLaneOp) { Ops.push_back(Vector); // The vector to be inserted Ops.push_back(Lane); // The lane to be inserted in the vector } Ops.push_back(Addr); Ops.push_back(Inc); EVT Tys[3] = { VT, MVT::i64, MVT::Other }; SDVTList SDTys = DAG.getVTList(Tys); unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT, LoadSDN->getMemOperand()); // Update the uses. SDValue NewResults[] = { SDValue(LD, 0), // The result of load SDValue(UpdN.getNode(), 2) // Chain }; DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register break; } return SDValue(); } /// Simplify ``Addr`` given that the top byte of it is ignored by HW during /// address translation. static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { APInt DemandedMask = APInt::getLowBitsSet(64, 56); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); return true; } return false; } static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) return Split; if (Subtarget->supportsAddressTopByteIgnored() && performTBISimplification(N->getOperand(2), DCI, DAG)) return SDValue(N, 0); return SDValue(); } /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); unsigned AddrOpIdx = N->getNumOperands() - 1; SDValue Addr = N->getOperand(AddrOpIdx); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load/store. Otherwise, folding // it would create a cycle. SmallPtrSet Visited; SmallVector Worklist; Visited.insert(Addr.getNode()); Worklist.push_back(N); Worklist.push_back(User); if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || SDNode::hasPredecessorHelper(User, Visited, Worklist)) continue; // Find the new opcode for the updating load/store. bool IsStore = false; bool IsLaneOp = false; bool IsDupOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; NumVecs = 2; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; NumVecs = 3; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; NumVecs = 4; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; NumVecs = 2; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; NumVecs = 3; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; NumVecs = 4; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; NumVecs = 2; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; NumVecs = 3; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; NumVecs = 4; IsStore = true; IsLaneOp = true; break; } EVT VecTy; if (IsStore) VecTy = N->getOperand(2).getValueType(); else VecTy = N->getValueType(0); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (IsLaneOp || IsDupOp) NumBytes /= VecTy.getVectorNumElements(); if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } SmallVector Ops; Ops.push_back(N->getOperand(0)); // Incoming chain // Load lane and store have vector list as input. if (IsLaneOp || IsStore) for (unsigned i = 2; i < AddrOpIdx; ++i) Ops.push_back(N->getOperand(i)); Ops.push_back(Addr); // Base register Ops.push_back(Inc); // Return Types. EVT Tys[6]; unsigned NumResultVecs = (IsStore ? 0 : NumVecs); unsigned n; for (n = 0; n < NumResultVecs; ++n) Tys[n] = VecTy; Tys[n++] = MVT::i64; // Type of write back register Tys[n] = MVT::Other; // Type of the chain SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); MemIntrinsicSDNode *MemInt = cast(N); SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, MemInt->getMemoryVT(), MemInt->getMemOperand()); // Update the uses. std::vector NewResults; for (unsigned i = 0; i < NumResultVecs; ++i) { NewResults.push_back(SDValue(UpdN.getNode(), i)); } NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } return SDValue(); } // Checks to see if the value is the prescribed width and returns information // about its extension mode. static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { ExtType = ISD::NON_EXTLOAD; switch(V.getNode()->getOpcode()) { default: return false; case ISD::LOAD: { LoadSDNode *LoadNode = cast(V.getNode()); if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { ExtType = LoadNode->getExtensionType(); return true; } return false; } case ISD::AssertSext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::SEXTLOAD; return true; } return false; } case ISD::AssertZext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::ZEXTLOAD; return true; } return false; } case ISD::Constant: case ISD::TargetConstant: { return std::abs(cast(V.getNode())->getSExtValue()) < 1LL << (width - 1); } } return true; } // This function does a whole lot of voodoo to determine if the tests are // equivalent without and with a mask. Essentially what happens is that given a // DAG resembling: // // +-------------+ +-------------+ +-------------+ +-------------+ // | Input | | AddConstant | | CompConstant| | CC | // +-------------+ +-------------+ +-------------+ +-------------+ // | | | | // V V | +----------+ // +-------------+ +----+ | | // | ADD | |0xff| | | // +-------------+ +----+ | | // | | | | // V V | | // +-------------+ | | // | AND | | | // +-------------+ | | // | | | // +-----+ | | // | | | // V V V // +-------------+ // | CMP | // +-------------+ // // The AND node may be safely removed for some combinations of inputs. In // particular we need to take into account the extension type of the Input, // the exact values of AddConstant, CompConstant, and CC, along with the nominal // width of the input (this can work for any width inputs, the above graph is // specific to 8 bits. // // The specific equations were worked out by generating output tables for each // AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The // problem was simplified by working with 4 bit inputs, which means we only // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 // patterns present in both extensions (0,7). For every distinct set of // AddConstant and CompConstants bit patterns we can consider the masked and // unmasked versions to be equivalent if the result of this function is true for // all 16 distinct bit patterns of for the current extension type of Input (w0). // // sub w8, w0, w1 // and w10, w8, #0x0f // cmp w8, w2 // cset w9, AArch64CC // cmp w10, w2 // cset w11, AArch64CC // cmp w9, w11 // cset w0, eq // ret // // Since the above function shows when the outputs are equivalent it defines // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and // would be expensive to run during compiles. The equations below were written // in a test harness that confirmed they gave equivalent outputs to the above // for all inputs function, so they can be used determine if the removal is // legal instead. // // isEquivalentMaskless() is the code for testing if the AND can be removed // factored out of the DAG recognition as the DAG can take several forms. static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant) { // By being careful about our equations and only writing the in term // symbolic values and well known constants (0, 1, -1, MaxUInt) we can // make them generally applicable to all bit widths. int MaxUInt = (1 << width); // For the purposes of these comparisons sign extending the type is // equivalent to zero extending the add and displacing it by half the integer // width. Provided we are careful and make sure our equations are valid over // the whole range we can just adjust the input and avoid writing equations // for sign extended inputs. if (ExtType == ISD::SEXTLOAD) AddConstant -= (1 << (width-1)); switch(CC) { case AArch64CC::LE: case AArch64CC::GT: if ((AddConstant == 0) || (CompConstant == MaxUInt - 1 && AddConstant < 0) || (AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) return true; break; case AArch64CC::LT: case AArch64CC::GE: if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) return true; break; case AArch64CC::HI: case AArch64CC::LS: if ((AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant >= -1 && CompConstant < AddConstant + MaxUInt)) return true; break; case AArch64CC::PL: case AArch64CC::MI: if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) || (AddConstant < 0 && CompConstant <= AddConstant)) return true; break; case AArch64CC::LO: case AArch64CC::HS: if ((AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant >= 0 && CompConstant <= AddConstant + MaxUInt)) return true; break; case AArch64CC::EQ: case AArch64CC::NE: if ((AddConstant > 0 && CompConstant < 0) || (AddConstant < 0 && CompConstant >= 0 && CompConstant < AddConstant + MaxUInt) || (AddConstant >= 0 && CompConstant >= 0 && CompConstant >= AddConstant) || (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) return true; break; case AArch64CC::VS: case AArch64CC::VC: case AArch64CC::AL: case AArch64CC::NV: return true; case AArch64CC::Invalid: break; } return false; } static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex) { unsigned CC = cast(N->getOperand(CCIndex))->getSExtValue(); SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); unsigned CondOpcode = SubsNode->getOpcode(); if (CondOpcode != AArch64ISD::SUBS) return SDValue(); // There is a SUBS feeding this condition. Is it fed by a mask we can // use? SDNode *AndNode = SubsNode->getOperand(0).getNode(); unsigned MaskBits = 0; if (AndNode->getOpcode() != ISD::AND) return SDValue(); if (ConstantSDNode *CN = dyn_cast(AndNode->getOperand(1))) { uint32_t CNV = CN->getZExtValue(); if (CNV == 255) MaskBits = 8; else if (CNV == 65535) MaskBits = 16; } if (!MaskBits) return SDValue(); SDValue AddValue = AndNode->getOperand(0); if (AddValue.getOpcode() != ISD::ADD) return SDValue(); // The basic dag structure is correct, grab the inputs and validate them. SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); SDValue SubsInputValue = SubsNode->getOperand(1); // The mask is present and the provenance of all the values is a smaller type, // lets see if the mask is superfluous. if (!isa(AddInputValue2.getNode()) || !isa(SubsInputValue.getNode())) return SDValue(); ISD::LoadExtType ExtType; if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || !checkValueWidth(AddInputValue2, MaskBits, ExtType) || !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) return SDValue(); if(!isEquivalentMaskless(CC, MaskBits, ExtType, cast(AddInputValue2.getNode())->getSExtValue(), cast(SubsInputValue.getNode())->getSExtValue())) return SDValue(); // The AND is not necessary, remove it. SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), SubsNode->getValueType(1)); SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); return SDValue(N, 0); } // Optimize compare with zero and branch. static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions // will not be produced, as they are conditional branch instructions that do // not set flags. if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) return SDValue(); if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue CCVal = N->getOperand(2); SDValue Cmp = N->getOperand(3); assert(isa(CCVal) && "Expected a ConstantSDNode here!"); unsigned CC = cast(CCVal)->getZExtValue(); if (CC != AArch64CC::EQ && CC != AArch64CC::NE) return SDValue(); unsigned CmpOpc = Cmp.getOpcode(); if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) return SDValue(); // Only attempt folding if there is only one use of the flag and no use of the // value. if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) return SDValue(); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); assert(LHS.getValueType() == RHS.getValueType() && "Expected the value type to be the same for both operands!"); if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); if (isNullConstant(LHS)) std::swap(LHS, RHS); if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || LHS.getOpcode() == ISD::SRL) return SDValue(); // Fold the compare into the branch instruction. SDValue BR; if (CC == AArch64CC::EQ) BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); else BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, BR, false); return SDValue(); } // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test // as well as whether the test should be inverted. This code is required to // catch these cases (as opposed to standard dag combines) because // AArch64ISD::TBZ is matched during legalization. static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG) { if (!Op->hasOneUse()) return Op; // We don't handle undef/constant-fold cases below, as they should have // already been taken care of (e.g. and of 0, test of undefined shifted bits, // etc.) // (tbz (trunc x), b) -> (tbz x, b) // This case is just here to enable more of the below cases to be caught. if (Op->getOpcode() == ISD::TRUNCATE && Bit < Op->getValueType(0).getSizeInBits()) { return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. if (Op->getOpcode() == ISD::ANY_EXTEND && Bit < Op->getOperand(0).getValueSizeInBits()) { return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } if (Op->getNumOperands() != 2) return Op; auto *C = dyn_cast(Op->getOperand(1)); if (!C) return Op; switch (Op->getOpcode()) { default: return Op; // (tbz (and x, m), b) -> (tbz x, b) case ISD::AND: if ((C->getZExtValue() >> Bit) & 1) return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); return Op; // (tbz (shl x, c), b) -> (tbz x, b-c) case ISD::SHL: if (C->getZExtValue() <= Bit && (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { Bit = Bit - C->getZExtValue(); return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } return Op; // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x case ISD::SRA: Bit = Bit + C->getZExtValue(); if (Bit >= Op->getValueType(0).getSizeInBits()) Bit = Op->getValueType(0).getSizeInBits() - 1; return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); // (tbz (srl x, c), b) -> (tbz x, b+c) case ISD::SRL: if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { Bit = Bit + C->getZExtValue(); return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } return Op; // (tbz (xor x, -1), b) -> (tbnz x, b) case ISD::XOR: if ((C->getZExtValue() >> Bit) & 1) Invert = !Invert; return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } } // Optimize test single bit zero/non-zero and branch. static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { unsigned Bit = cast(N->getOperand(2))->getZExtValue(); bool Invert = false; SDValue TestSrc = N->getOperand(1); SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); if (TestSrc == NewTestSrc) return SDValue(); unsigned NewOpc = N->getOpcode(); if (Invert) { if (NewOpc == AArch64ISD::TBZ) NewOpc = AArch64ISD::TBNZ; else { assert(NewOpc == AArch64ISD::TBNZ); NewOpc = AArch64ISD::TBZ; } } SDLoc DL(N); return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); } // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine // such VSELECT. static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || CCVT.getVectorElementType() != MVT::i1) return SDValue(); EVT ResVT = N->getValueType(0); EVT CmpVT = N0.getOperand(0).getValueType(); // Only combine when the result type is of the same size as the compared // operands. if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) return SDValue(); SDValue IfTrue = N->getOperand(1); SDValue IfFalse = N->getOperand(2); SDValue SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, IfTrue, IfFalse); } /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with /// the compare-mask instructions rather than going via NZCV, even if LHS and /// RHS are really scalar. This replaces any scalar setcc in the above pattern /// with a vector one followed by a DUP shuffle on the result. static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); EVT ResVT = N->getValueType(0); if (N0.getOpcode() != ISD::SETCC) return SDValue(); // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered // scalar SetCCResultType. We also don't expect vectors, because we assume // that selects fed by vector SETCCs are canonicalized to VSELECT. assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && "Scalar-SETCC feeding SELECT has unexpected result type!"); // If NumMaskElts == 0, the comparison is larger than select result. The // largest real NEON comparison is 64-bits per lane, which means the result is // at most 32-bits and an illegal vector. Just bail out for now. EVT SrcVT = N0.getOperand(0).getValueType(); // Don't try to do this optimization when the setcc itself has i1 operands. // There are no legal vectors of i1, so this would be pointless. if (SrcVT == MVT::i1) return SDValue(); int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); // Also bail out if the vector CCVT isn't the same size as ResVT. // This can happen if the SETCC operand size doesn't divide the ResVT size // (e.g., f64 vs v3f32). if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) return SDValue(); // Make sure we didn't create illegal types, if we're not supposed to. assert(DCI.isBeforeLegalize() || DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); // First perform a vector comparison, where lane 0 is the one we're interested // in. SDLoc DL(N0); SDValue LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); SDValue RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); // Now duplicate the comparison mask we want across all other lanes. SmallVector DUPMask(CCVT.getVectorNumElements(), 0); SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } /// Get rid of unnecessary NVCASTs (that don't change the type). static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) return N->getOperand(0); return SDValue(); } // If all users of the globaladdr are of the form (globaladdr + constant), find // the smallest constant, fold it into the globaladdr's offset and rewrite the // globaladdr as (globaladdr + constant) - constant. static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM) { auto *GN = cast(N); if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) != AArch64II::MO_NO_FLAG) return SDValue(); uint64_t MinOffset = -1ull; for (SDNode *N : GN->uses()) { if (N->getOpcode() != ISD::ADD) return SDValue(); auto *C = dyn_cast(N->getOperand(0)); if (!C) C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); MinOffset = std::min(MinOffset, C->getZExtValue()); } uint64_t Offset = MinOffset + GN->getOffset(); // Require that the new offset is larger than the existing one. Otherwise, we // can end up oscillating between two possible DAGs, for example, // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). if (Offset <= uint64_t(GN->getOffset())) return SDValue(); // Check whether folding this offset is legal. It must not go out of bounds of // the referenced object to avoid violating the code model, and must be // smaller than 2^21 because this is the largest offset expressible in all // object formats. // // This check also prevents us from folding negative offsets, which will end // up being treated in the same way as large positive ones. They could also // cause code model violations, and aren't really common enough to matter. if (Offset >= (1 << 21)) return SDValue(); const GlobalValue *GV = GN->getGlobal(); Type *T = GV->getValueType(); if (!T->isSized() || Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T)) return SDValue(); SDLoc DL(GN); SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset); return DAG.getNode(ISD::SUB, DL, MVT::i64, Result, DAG.getConstant(MinOffset, DL, MVT::i64)); } SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; case ISD::ADD: case ISD::SUB: return performAddSubLongCombine(N, DCI, DAG); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, DCI, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::AND: return performANDCombine(N, DCI); case ISD::SRL: return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: return performExtendCombine(N, DCI, DAG); case ISD::BITCAST: return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: return performSelectCombine(N, DCI); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::LOAD: if (performTBISimplification(N->getOperand(1), DCI, DAG)) return SDValue(N, 0); break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::TBNZ: case AArch64ISD::TBZ: return performTBZCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: return performPostLD1Combine(N, DCI, false); case AArch64ISD::NVCAST: return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); default: break; } break; case ISD::GlobalAddress: return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); } return SDValue(); } // Check if the return value is used as only a return value, as otherwise // we can't perform a tail-call. In particular, we need to check for // target ISD nodes that are returns and any other "odd" constructs // that the generic analysis code won't necessarily catch. bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode *Node : Copy->uses()) { if (Node->getOpcode() != AArch64ISD::RET_FLAG) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } // Return whether the an instruction can potentially be optimized to a tail // call. This will cause the optimizers to attempt to move, or duplicate, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return CI->isTailCall(); } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const { if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) return false; Base = Op->getOperand(0); // All of the indexed addressing mode instructions take a signed // 9 bit immediate offset. if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { int64_t RHSC = RHS->getSExtValue(); if (Op->getOpcode() == ISD::SUB) RHSC = -(uint64_t)RHSC; if (!isInt<9>(RHSC)) return false; IsInc = (Op->getOpcode() == ISD::ADD); Offset = Op->getOperand(1); return true; } return false; } bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool IsInc; if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) return false; AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; return true; } bool AArch64TargetLowering::getPostIndexedAddressParts( SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool IsInc; if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) return false; // Post-indexing updates the base, so it's not a valid transform // if that's not the same as the load's pointer. if (Ptr != Base) return false; AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; return true; } static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { SDLoc DL(N); SDValue Op = N->getOperand(0); if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) return; Op = SDValue( DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } static void ReplaceReductionResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp) { EVT LoVT, HiVT; SDValue Lo, Hi; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); Results.push_back(SplitVal); } static std::pair splitInt128(SDValue N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N); SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, DAG.getNode(ISD::SRL, DL, MVT::i128, N, DAG.getConstant(64, DL, MVT::i64))); return std::make_pair(Lo, Hi); } // Create an even/odd pair of X registers holding integer value V. static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDLoc dl(V.getNode()); SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64); SDValue VHi = DAG.getAnyExtOrTrunc( DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)), dl, MVT::i64); if (DAG.getDataLayout().isBigEndian()) std::swap (VLo, VHi); SDValue RegClass = DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32); SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32); SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32); const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; return SDValue( DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); } static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { assert(N->getValueType(0) == MVT::i128 && "AtomicCmpSwap on types less than 128 should be legal"); if (Subtarget->hasLSE()) { // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. SDValue Ops[] = { createGPRPairNode(DAG, N->getOperand(2)), // Compare value createGPRPairNode(DAG, N->getOperand(3)), // Store value N->getOperand(1), // Ptr N->getOperand(0), // Chain in }; MachineMemOperand *MemOp = cast(N)->getMemOperand(); unsigned Opcode; switch (MemOp->getOrdering()) { case AtomicOrdering::Monotonic: Opcode = AArch64::CASPX; break; case AtomicOrdering::Acquire: Opcode = AArch64::CASPAX; break; case AtomicOrdering::Release: Opcode = AArch64::CASPLX; break; case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: Opcode = AArch64::CASPALX; break; default: llvm_unreachable("Unexpected ordering!"); } MachineSDNode *CmpSwap = DAG.getMachineNode( Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops); DAG.setNodeMemRefs(CmpSwap, {MemOp}); unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; if (DAG.getDataLayout().isBigEndian()) std::swap(SubReg1, SubReg2); Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, SDValue(CmpSwap, 0))); Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 1)); // Chain out return; } auto Desired = splitInt128(N->getOperand(2), DAG); auto New = splitInt128(N->getOperand(3), DAG); SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, New.first, New.second, N->getOperand(0)}; SDNode *CmpSwap = DAG.getMachineNode( AArch64::CMP_SWAP_128, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); MachineMemOperand *MemOp = cast(N)->getMemOperand(); DAG.setNodeMemRefs(cast(CmpSwap), {MemOp}); Results.push_back(SDValue(CmpSwap, 0)); Results.push_back(SDValue(CmpSwap, 1)); Results.push_back(SDValue(CmpSwap, 3)); } void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this"); case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; case AArch64ISD::UADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); return; case AArch64ISD::SMINV: ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); return; case AArch64ISD::UMINV: ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); return; case AArch64ISD::SMAXV: ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); return; case AArch64ISD::UMAXV: ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. return; case ISD::ATOMIC_CMP_SWAP: ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); return; } } bool AArch64TargetLowering::useLoadStackGuardNode() const { if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) return TargetLowering::useLoadStackGuardNode(); return true; } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. return 3; } TargetLoweringBase::LegalizeTypeAction AArch64TargetLowering::getPreferredVectorAction(MVT VT) const { // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, // v4i16, v2i32 instead of to promote. if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 || VT == MVT::v1f32) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); return Size == 128; } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (AI->isFloatingPointOperation()) return AtomicExpansionKind::CmpXChg; unsigned Size = AI->getType()->getPrimitiveSizeInBits(); if (Size > 128) return AtomicExpansionKind::None; // Nand not supported in LSE. if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; // Leave 128 bits to LLSC. return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { // If subtarget has LSE, leave cmpxchg intact for codegen. if (Subtarget->hasLSE()) return AtomicExpansionKind::None; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. if (getTargetMachine().getOptLevel() == 0) return AtomicExpansionKind::None; return AtomicExpansionKind::LLSC; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast(Addr->getType())->getElementType(); bool IsAcquire = isAcquireOrStronger(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a // single i128 here. if (ValTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; Function *Ldxr = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); Type *EltTy = cast(Addr->getType())->getElementType(); const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy)); Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy); return Builder.CreateBitCast(Trunc, EltTy); } void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form // before the call. if (Val->getType()->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; Type *Tys[] = { Addr->getType() }; Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); Val = Builder.CreateBitCast(Val, IntValTy); return Builder.CreateCall(Stxr, {Builder.CreateZExtOrBitCast( Val, Stxr->getFunctionType()->getParamType(0)), Addr}); } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, EVT) const { return false; } static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset), IRB.getInt8PtrTy()->getPointerTo(0)); } Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { // Android provides a fixed TLS slot for the stack cookie. See the definition // of TLS_SLOT_STACK_GUARD in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget->isTargetAndroid()) return UseTlsOffset(IRB, 0x28); // Fuchsia is similar. // defines ZX_TLS_STACK_GUARD_OFFSET with this value. if (Subtarget->isTargetFuchsia()) return UseTlsOffset(IRB, -0x10); return TargetLowering::getIRStackGuard(IRB); } void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) { // MSVC CRT has a global variable holding security cookie. M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( "__security_check_cookie", Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext())); if (Function *F = dyn_cast(SecurityCheckCookie.getCallee())) { F->setCallingConv(CallingConv::Win64); F->addAttribute(1, Attribute::AttrKind::InReg); } return; } TargetLowering::insertSSPDeclarations(M); } Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { // MSVC CRT has a global variable holding security cookie. if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) return M.getGlobalVariable("__security_cookie"); return TargetLowering::getSDagStackGuard(M); } Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) return M.getFunction("__security_check_cookie"); return TargetLowering::getSSPStackGuardCheck(M); } Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget->isTargetAndroid()) return UseTlsOffset(IRB, 0x48); // Fuchsia is similar. // defines ZX_TLS_UNSAFE_SP_OFFSET with this value. if (Subtarget->isTargetFuchsia()) return UseTlsOffset(IRB, -0x8); return TargetLowering::getSafeStackPointerLocation(IRB); } bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( const Instruction &AndI) const { // Only sink 'and' mask to cmp use block if it is masking a single bit, since // this is likely to be fold the and/cmp/br into a single tbz instruction. It // may be beneficial to sink in other cases, but we would have to check that // the cmp would not get folded into the br to form a cbz for these to be // beneficial. ConstantInt* Mask = dyn_cast(AndI.getOperand(1)); if (!Mask) return false; return Mask->getValue().isPowerOf2(); +} + +bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget->isTargetWindows()) + return false; + return true; } void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in AArch64unctionInfo. AArch64FunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void AArch64TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (AArch64::GPR64RegClass.contains(*I)) RC = &AArch64::GPR64RegClass; else if (AArch64::FPR64RegClass.contains(*I)) RC = &AArch64::FPR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on AArch64 is expensive. However, when aggressively // optimizing for code size, we prefer to use a div instruction, as it is // usually smaller than the alternative sequence. // The exception to this is vector division. Since AArch64 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { // We want inc-of-add for scalars and sub-of-not for vectors. return VT.isScalarInteger(); } bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); } unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) return getPointerTy(DL).getSizeInBits(); return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; } void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } // Unlike X86, we let frame lowering assign offsets to all catch objects. bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; } Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.h (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.h (revision 351709) @@ -1,748 +1,744 @@ //==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that AArch64 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #include "AArch64.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Instruction.h" namespace llvm { namespace AArch64ISD { enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. CALL, // Function call. // Produces the full sequence of instructions for getting the thread pointer // offset of a variable into X0, using the TLSDesc model. TLSDESC_CALLSEQ, ADRP, // Page address of a TargetGlobalAddress operand. ADR, // ADR ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. LOADgot, // Load from automatically generated descriptor (e.g. Global // Offset Table, TLS record). RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. BRCOND, // Conditional branch instruction; "b.cond". CSEL, FCSEL, // Conditional move instruction. CSINV, // Conditional select invert. CSNEG, // Conditional select negate. CSINC, // Conditional select increment. // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on // ELF. THREAD_POINTER, ADC, SBC, // adc, sbc instructions // Arithmetic instructions which write flags. ADDS, SUBS, ADCS, SBCS, ANDS, // Conditional compares. Operands: left,right,falsecc,cc,flags CCMP, CCMN, FCCMP, // Floating point comparison FCMP, // Scalar extract EXTR, // Scalar-to-vector duplication DUP, DUPLANE8, DUPLANE16, DUPLANE32, DUPLANE64, // Vector immedate moves MOVI, MOVIshift, MOVIedit, MOVImsl, FMOV, MVNIshift, MVNImsl, // Vector immediate ops BICi, ORRi, // Vector bit select: similar to ISD::VSELECT but not all bits within an // element must be identical. BSL, // Vector arithmetic negation NEG, // Vector shuffles ZIP1, ZIP2, UZP1, UZP2, TRN1, TRN2, REV16, REV32, REV64, EXT, // Vector shift by scalar VSHL, VLSHR, VASHR, // Vector shift by scalar (again) SQSHL_I, UQSHL_I, SQSHLU_I, SRSHR_I, URSHR_I, // Vector comparisons CMEQ, CMGE, CMGT, CMHI, CMHS, FCMEQ, FCMGE, FCMGT, // Vector zero comparisons CMEQz, CMGEz, CMGTz, CMLEz, CMLTz, FCMEQz, FCMGEz, FCMGTz, FCMLEz, FCMLTz, // Vector across-lanes addition // Only the lower result lane is defined. SADDV, UADDV, // Vector across-lanes min/max // Only the lower result lane is defined. SMINV, UMINV, SMAXV, UMAXV, // Vector bitwise negation NOT, // Vector bitwise selection BIT, // Compare-and-branch CBZ, CBNZ, TBZ, TBNZ, // Tail calls TC_RETURN, // Custom prefetch handling PREFETCH, // {s|u}int to FP within a FP register. SITOF, UITOF, /// Natural vector cast. ISD::BITCAST is not natural in the big-endian /// world w.r.t vectors; which causes additional REV instructions to be /// generated to compensate for the byte-swapping. But sometimes we do /// need to re-interpret the data in SIMD vector registers in big-endian /// mode without emitting such REV instructions. NVCAST, SMULL, UMULL, // Reciprocal estimates and steps. FRECPE, FRECPS, FRSQRTE, FRSQRTS, // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, LD4post, ST2post, ST3post, ST4post, LD1x2post, LD1x3post, LD1x4post, ST1x2post, ST1x3post, ST1x4post, LD1DUPpost, LD2DUPpost, LD3DUPpost, LD4DUPpost, LD1LANEpost, LD2LANEpost, LD3LANEpost, LD4LANEpost, ST2LANEpost, ST3LANEpost, ST4LANEpost, STG, STZG, ST2G, STZ2G }; } // end namespace AArch64ISD namespace { // Any instruction that defines a 32-bit result zeros out the high half of the // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may // be copying from a truncate. But any other 32-bit operation will zero-extend // up to 64 bits. // FIXME: X86 also checks for CMOV here. Do we need something similar? static inline bool isDef32(const SDNode &N) { unsigned Opc = N.getOpcode(); return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG && Opc != ISD::CopyFromReg; } } // end anonymous namespace class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const; /// Determine which of the bits specified in Mask are known to be either zero /// or one and return them in the KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; /// Returns true if the target allows unaligned memory accesses of the /// specified type. bool allowsMisalignedMemoryAccesses( EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. return true; } /// This method returns a target specific FastISel object, or null if the /// target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; /// Return true if the given shuffle mask can be codegen'd directly, or if it /// should be stack expanded. bool isShuffleMaskLegal(ArrayRef M, EVT VT) const override; /// Return the ISD::SETCC ValueType. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; MachineBasicBlock *EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override; bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override; bool isTruncateFree(Type *Ty1, Type *Ty2) const override; bool isTruncateFree(EVT VT1, EVT VT2) const override; bool isProfitableToHoist(Instruction *I) const override; bool isZExtFree(Type *Ty1, Type *Ty2) const override; bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool shouldSinkOperands(Instruction *I, SmallVectorImpl &Ops) const override; bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalICmpImmediate(int64_t) const override; bool shouldConsiderGEPOffsetSplit() const override; EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const override; /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I = nullptr) const override; /// Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this method /// returns true, otherwise fmuladd is expanded to fmul + fadd. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; /// Returns false if N is a bit extraction pattern of (X >> C) & Mask. bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; /// If the target has a standard location for the stack protector cookie, /// returns the address of that location. Otherwise, returns nullptr. Value *getIRStackGuard(IRBuilder<> &IRB) const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; Function *getSSPStackGuardCheck(const Module &M) const override; /// If the target has a standard location for the unsafe stack pointer, /// returns the address of that location. Otherwise, returns nullptr. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. unsigned getExceptionPointerRegister(const Constant *PersonalityFn) const override { // FIXME: This is a guess. Has this been defined yet? return AArch64::X0; } /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. unsigned getExceptionSelectorRegister(const Constant *PersonalityFn) const override { // FIXME: This is a guess. Has this been defined yet? return AArch64::X1; } bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const override { // Do not merge to float value size (128 bytes) if no implicit // float attribute is set. bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (NoFloat) return (MemVT.getSizeInBits() <= 64); return true; } bool isCheapToSpeculateCttz() const override { return true; } bool isCheapToSpeculateCtlz() const override { return true; } bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; bool hasAndNotCompare(SDValue V) const override { // We can use bics for any scalar. return V.getValueType().isScalarInteger(); } bool hasAndNot(SDValue Y) const override { EVT VT = Y.getValueType(); if (!VT.isVector()) return hasAndNotCompare(Y); return VT.getSizeInBits() >= 64; // vector 'bic' } - bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { - if (DAG.getMachineFunction().getFunction().hasMinSize()) - return false; - return true; - } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { // For vectors, we don't have a preference.. if (XVT.isVector()) return false; auto VTIsOk = [](EVT VT) -> bool { return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64; }; // We are ok with KeptBitsVT being byte/word/dword, what SXT supports. // XVT will be larger than KeptBitsVT. MVT KeptBitsVT = MVT::getIntegerVT(KeptBits); return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } bool preferIncOfAddToSubOfNot(EVT VT) const override; bool hasBitPreservingFPLogic(EVT VT) const override { // FIXME: Is this always true? It should be true for vectors at least. return VT == MVT::f32 || VT == MVT::f64; } bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction().hasFnAttribute(Attribute::NoUnwind); } void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const override; bool supportSwiftError() const override { return true; } /// Enable aggressive FMA fusion on targets that want it. bool enableAggressiveFMAFusion(EVT VT) const override; /// Returns the size of the platform's va_list object. unsigned getVaListSizeInBits(const DataLayout &DL) const override; /// Returns true if \p VecTy is a legal interleaved access type. This /// function checks the vector element type and the overall width of the /// vector. bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL) const; /// Returns the number of interleaved accesses that will be generated when /// lowering accesses of the given type. unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; /// Used for exception handling on Win64. bool needsFixedCatchObjects() const override; private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; bool isExtFreeImpl(const Instruction *Ext) const override; void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; SDValue LowerCall(CallLoweringInfo & /*CLI*/, SmallVectorImpl &InVals) const override; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const; /// Finds the incoming stack arguments which overlap the given fixed stack /// object and incorporates their load into the current chain. This prevents /// an upcoming store from clobbering the stack argument before it's used. SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const; bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; template SDValue getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; template SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; template SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; template SDValue getAddrTiny(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const; SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override; SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps) const override; unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override; std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; const char *LowerXConstraint(EVT ConstraintVT) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const override; unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "Q") return InlineAsm::Constraint_Q; // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are // followed by llvm_unreachable so we'll leave them unimplemented in // the backend for now. return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const; bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; void finalizeLowering(MachineFunction &MF) const override; }; namespace AArch64 { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } // end namespace AArch64 } // end namespace llvm #endif Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.cpp (revision 351709) @@ -1,5561 +1,5573 @@ //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains the AArch64 implementation of the TargetInstrInfo class. // //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" static cl::opt TBZDisplacementBits( "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); static cl::opt CBZDisplacementBits( "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); static cl::opt BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), RI(STI.getTargetTriple()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { const MachineBasicBlock &MBB = *MI.getParent(); const MachineFunction *MF = MBB.getParent(); const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); { auto Op = MI.getOpcode(); if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); } // FIXME: We currently only handle pseudoinstructions that don't get expanded // before the assembly printer. unsigned NumBytes = 0; const MCInstrDesc &Desc = MI.getDesc(); switch (Desc.getOpcode()) { default: // Anything not explicitly designated otherwise is a normal 4-byte insn. NumBytes = 4; break; case TargetOpcode::DBG_VALUE: case TargetOpcode::EH_LABEL: case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: NumBytes = 0; break; case TargetOpcode::STACKMAP: // The upper bound for a stackmap intrinsic is the full length of its shadow NumBytes = StackMapOpers(&MI).getNumPatchBytes(); assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); break; case TargetOpcode::PATCHPOINT: // The size of the patchpoint intrinsic is the number of bytes requested NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); break; case AArch64::TLSDESC_CALLSEQ: // This gets lowered to an instruction sequence which takes 16 bytes NumBytes = 16; break; case AArch64::JumpTableDest32: case AArch64::JumpTableDest16: case AArch64::JumpTableDest8: NumBytes = 12; break; case AArch64::SPACE: NumBytes = MI.getOperand(1).getImm(); break; } return NumBytes; } static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl &Cond) { // Block ends with fall-through condbranch. switch (LastInst->getOpcode()) { default: llvm_unreachable("Unknown branch instruction?"); case AArch64::Bcc: Target = LastInst->getOperand(1).getMBB(); Cond.push_back(LastInst->getOperand(0)); break; case AArch64::CBZW: case AArch64::CBZX: case AArch64::CBNZW: case AArch64::CBNZX: Target = LastInst->getOperand(1).getMBB(); Cond.push_back(MachineOperand::CreateImm(-1)); Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); Cond.push_back(LastInst->getOperand(0)); break; case AArch64::TBZW: case AArch64::TBZX: case AArch64::TBNZW: case AArch64::TBNZX: Target = LastInst->getOperand(2).getMBB(); Cond.push_back(MachineOperand::CreateImm(-1)); Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); Cond.push_back(LastInst->getOperand(0)); Cond.push_back(LastInst->getOperand(1)); } } static unsigned getBranchDisplacementBits(unsigned Opc) { switch (Opc) { default: llvm_unreachable("unexpected opcode!"); case AArch64::B: return 64; case AArch64::TBNZW: case AArch64::TBZW: case AArch64::TBNZX: case AArch64::TBZX: return TBZDisplacementBits; case AArch64::CBNZW: case AArch64::CBZW: case AArch64::CBNZX: case AArch64::CBZX: return CBZDisplacementBits; case AArch64::Bcc: return BCCDisplacementBits; } } bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, int64_t BrOffset) const { unsigned Bits = getBranchDisplacementBits(BranchOp); assert(Bits >= 3 && "max branch displacement must be enough to jump" "over conditional branch expansion"); return isIntN(Bits, BrOffset / 4); } MachineBasicBlock * AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: llvm_unreachable("unexpected opcode!"); case AArch64::B: return MI.getOperand(0).getMBB(); case AArch64::TBZW: case AArch64::TBNZW: case AArch64::TBZX: case AArch64::TBNZX: return MI.getOperand(2).getMBB(); case AArch64::CBZW: case AArch64::CBNZW: case AArch64::CBZX: case AArch64::CBNZX: case AArch64::Bcc: return MI.getOperand(1).getMBB(); } } // Branch analysis. bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); if (I == MBB.end()) return false; if (!isUnpredicatedTerminator(*I)) return false; // Get the last instruction in the block. MachineInstr *LastInst = &*I; // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { if (isUncondBranchOpcode(LastOpc)) { TBB = LastInst->getOperand(0).getMBB(); return false; } if (isCondBranchOpcode(LastOpc)) { // Block ends with fall-through condbranch. parseCondBranch(LastInst, TBB, Cond); return false; } return true; // Can't handle indirect branch. } // Get the instruction before it if it is a terminator. MachineInstr *SecondLastInst = &*I; unsigned SecondLastOpc = SecondLastInst->getOpcode(); // If AllowModify is true and the block ends with two or more unconditional // branches, delete all but the first unconditional branch. if (AllowModify && isUncondBranchOpcode(LastOpc)) { while (isUncondBranchOpcode(SecondLastOpc)) { LastInst->eraseFromParent(); LastInst = SecondLastInst; LastOpc = LastInst->getOpcode(); if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { // Return now the only terminator is an unconditional branch. TBB = LastInst->getOperand(0).getMBB(); return false; } else { SecondLastInst = &*I; SecondLastOpc = SecondLastInst->getOpcode(); } } } // If there are three terminators, we don't know what sort of block this is. if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; // If the block ends with a B and a Bcc, handle it. if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { parseCondBranch(SecondLastInst, TBB, Cond); FBB = LastInst->getOperand(0).getMBB(); return false; } // If the block ends with two unconditional branches, handle it. The second // one is not executed, so remove it. if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { TBB = SecondLastInst->getOperand(0).getMBB(); I = LastInst; if (AllowModify) I->eraseFromParent(); return false; } // ...likewise if it ends with an indirect branch followed by an unconditional // branch. if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { I = LastInst; if (AllowModify) I->eraseFromParent(); return true; } // Otherwise, can't handle this. return true; } bool AArch64InstrInfo::reverseBranchCondition( SmallVectorImpl &Cond) const { if (Cond[0].getImm() != -1) { // Regular Bcc AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); } else { // Folded compare-and-branch switch (Cond[1].getImm()) { default: llvm_unreachable("Unknown conditional branch!"); case AArch64::CBZW: Cond[1].setImm(AArch64::CBNZW); break; case AArch64::CBNZW: Cond[1].setImm(AArch64::CBZW); break; case AArch64::CBZX: Cond[1].setImm(AArch64::CBNZX); break; case AArch64::CBNZX: Cond[1].setImm(AArch64::CBZX); break; case AArch64::TBZW: Cond[1].setImm(AArch64::TBNZW); break; case AArch64::TBNZW: Cond[1].setImm(AArch64::TBZW); break; case AArch64::TBZX: Cond[1].setImm(AArch64::TBNZX); break; case AArch64::TBNZX: Cond[1].setImm(AArch64::TBZX); break; } } return false; } unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); if (I == MBB.end()) return 0; if (!isUncondBranchOpcode(I->getOpcode()) && !isCondBranchOpcode(I->getOpcode())) return 0; // Remove the branch. I->eraseFromParent(); I = MBB.end(); if (I == MBB.begin()) { if (BytesRemoved) *BytesRemoved = 4; return 1; } --I; if (!isCondBranchOpcode(I->getOpcode())) { if (BytesRemoved) *BytesRemoved = 4; return 1; } // Remove the branch. I->eraseFromParent(); if (BytesRemoved) *BytesRemoved = 8; return 2; } void AArch64InstrInfo::instantiateCondBranch( MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef Cond) const { if (Cond[0].getImm() != -1) { // Regular Bcc BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); } else { // Folded compare-and-branch // Note that we use addOperand instead of addReg to keep the flags. const MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); if (Cond.size() > 3) MIB.addImm(Cond[3].getImm()); MIB.addMBB(TBB); } } unsigned AArch64InstrInfo::insertBranch( MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL, int *BytesAdded) const { // Shouldn't be a fall through. assert(TBB && "insertBranch must not be told to insert a fallthrough"); if (!FBB) { if (Cond.empty()) // Unconditional branch? BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); else instantiateCondBranch(MBB, DL, TBB, Cond); if (BytesAdded) *BytesAdded = 4; return 1; } // Two-way conditional branch. instantiateCondBranch(MBB, DL, TBB, Cond); BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); if (BytesAdded) *BytesAdded = 8; return 2; } // Find the original register that VReg is copied from. static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { while (TargetRegisterInfo::isVirtualRegister(VReg)) { const MachineInstr *DefMI = MRI.getVRegDef(VReg); if (!DefMI->isFullCopy()) return VReg; VReg = DefMI->getOperand(1).getReg(); } return VReg; } // Determine if VReg is defined by an instruction that can be folded into a // csel instruction. If so, return the folded opcode, and the replacement // register. static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewVReg = nullptr) { VReg = removeCopies(MRI, VReg); if (!TargetRegisterInfo::isVirtualRegister(VReg)) return 0; bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); const MachineInstr *DefMI = MRI.getVRegDef(VReg); unsigned Opc = 0; unsigned SrcOpNum = 0; switch (DefMI->getOpcode()) { case AArch64::ADDSXri: case AArch64::ADDSWri: // if NZCV is used, do not fold. if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) return 0; // fall-through to ADDXri and ADDWri. LLVM_FALLTHROUGH; case AArch64::ADDXri: case AArch64::ADDWri: // add x, 1 -> csinc. if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || DefMI->getOperand(3).getImm() != 0) return 0; SrcOpNum = 1; Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; break; case AArch64::ORNXrr: case AArch64::ORNWrr: { // not x -> csinv, represented as orn dst, xzr, src. unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) return 0; SrcOpNum = 2; Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; break; } case AArch64::SUBSXrr: case AArch64::SUBSWrr: // if NZCV is used, do not fold. if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) return 0; // fall-through to SUBXrr and SUBWrr. LLVM_FALLTHROUGH; case AArch64::SUBXrr: case AArch64::SUBWrr: { // neg x -> csneg, represented as sub dst, xzr, src. unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) return 0; SrcOpNum = 2; Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; break; } default: return 0; } assert(Opc && SrcOpNum && "Missing parameters"); if (NewVReg) *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); return Opc; } bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef Cond, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Check register classes. const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); if (!RC) return false; // Expanding cbz/tbz requires an extra cycle of latency on the condition. unsigned ExtraCondLat = Cond.size() != 1; // GPRs are handled by csel. // FIXME: Fold in x+1, -x, and ~x when applicable. if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || AArch64::GPR32allRegClass.hasSubClassEq(RC)) { // Single-cycle csel, csinc, csinv, and csneg. CondCycles = 1 + ExtraCondLat; TrueCycles = FalseCycles = 1; if (canFoldIntoCSel(MRI, TrueReg)) TrueCycles = 0; else if (canFoldIntoCSel(MRI, FalseReg)) FalseCycles = 0; return true; } // Scalar floating point is handled by fcsel. // FIXME: Form fabs, fmin, and fmax when applicable. if (AArch64::FPR64RegClass.hasSubClassEq(RC) || AArch64::FPR32RegClass.hasSubClassEq(RC)) { CondCycles = 5 + ExtraCondLat; TrueCycles = FalseCycles = 2; return true; } // Can't do vectors. return false; } void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DstReg, ArrayRef Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // Parse the condition code, see parseCondBranch() above. AArch64CC::CondCode CC; switch (Cond.size()) { default: llvm_unreachable("Unknown condition opcode in Cond"); case 1: // b.cc CC = AArch64CC::CondCode(Cond[0].getImm()); break; case 3: { // cbz/cbnz // We must insert a compare against 0. bool Is64Bit; switch (Cond[1].getImm()) { default: llvm_unreachable("Unknown branch opcode in Cond"); case AArch64::CBZW: Is64Bit = false; CC = AArch64CC::EQ; break; case AArch64::CBZX: Is64Bit = true; CC = AArch64CC::EQ; break; case AArch64::CBNZW: Is64Bit = false; CC = AArch64CC::NE; break; case AArch64::CBNZX: Is64Bit = true; CC = AArch64CC::NE; break; } unsigned SrcReg = Cond[2].getReg(); if (Is64Bit) { // cmp reg, #0 is actually subs xzr, reg, #0. MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) .addReg(SrcReg) .addImm(0) .addImm(0); } else { MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) .addReg(SrcReg) .addImm(0) .addImm(0); } break; } case 4: { // tbz/tbnz // We must insert a tst instruction. switch (Cond[1].getImm()) { default: llvm_unreachable("Unknown branch opcode in Cond"); case AArch64::TBZW: case AArch64::TBZX: CC = AArch64CC::EQ; break; case AArch64::TBNZW: case AArch64::TBNZX: CC = AArch64CC::NE; break; } // cmp reg, #foo is actually ands xzr, reg, #1<> (64 - BitSize); uint64_t Encoding; return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); } // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { if (!Subtarget.hasCustomCheapAsMoveHandling()) return MI.isAsCheapAsAMove(); const unsigned Opcode = MI.getOpcode(); // Firstly, check cases gated by features. if (Subtarget.hasZeroCycleZeroingFP()) { if (Opcode == AArch64::FMOVH0 || Opcode == AArch64::FMOVS0 || Opcode == AArch64::FMOVD0) return true; } if (Subtarget.hasZeroCycleZeroingGP()) { if (Opcode == TargetOpcode::COPY && (MI.getOperand(1).getReg() == AArch64::WZR || MI.getOperand(1).getReg() == AArch64::XZR)) return true; } // Secondly, check cases specific to sub-targets. if (Subtarget.hasExynosCheapAsMoveHandling()) { if (isExynosCheapAsMove(MI)) return true; return MI.isAsCheapAsAMove(); } // Finally, check generic cases. switch (Opcode) { default: return false; // add/sub on register without shift case AArch64::ADDWri: case AArch64::ADDXri: case AArch64::SUBWri: case AArch64::SUBXri: return (MI.getOperand(3).getImm() == 0); // logical ops on immediate case AArch64::ANDWri: case AArch64::ANDXri: case AArch64::EORWri: case AArch64::EORXri: case AArch64::ORRWri: case AArch64::ORRXri: return true; // logical ops on register without shift case AArch64::ANDWrr: case AArch64::ANDXrr: case AArch64::BICWrr: case AArch64::BICXrr: case AArch64::EONWrr: case AArch64::EONXrr: case AArch64::EORWrr: case AArch64::EORXrr: case AArch64::ORNWrr: case AArch64::ORNXrr: case AArch64::ORRWrr: case AArch64::ORRXrr: return true; // If MOVi32imm or MOVi64imm can be expanded into ORRWri or // ORRXri, it is as cheap as MOV case AArch64::MOVi32imm: return canBeExpandedToORR(MI, 32); case AArch64::MOVi64imm: return canBeExpandedToORR(MI, 64); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); } bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return false; case AArch64::ADDWrs: case AArch64::ADDXrs: case AArch64::ADDSWrs: case AArch64::ADDSXrs: { unsigned Imm = MI.getOperand(3).getImm(); unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); if (ShiftVal == 0) return true; return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; } case AArch64::ADDWrx: case AArch64::ADDXrx: case AArch64::ADDXrx64: case AArch64::ADDSWrx: case AArch64::ADDSXrx: case AArch64::ADDSXrx64: { unsigned Imm = MI.getOperand(3).getImm(); switch (AArch64_AM::getArithExtendType(Imm)) { default: return false; case AArch64_AM::UXTB: case AArch64_AM::UXTH: case AArch64_AM::UXTW: case AArch64_AM::UXTX: return AArch64_AM::getArithShiftValue(Imm) <= 4; } } case AArch64::SUBWrs: case AArch64::SUBSWrs: { unsigned Imm = MI.getOperand(3).getImm(); unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); return ShiftVal == 0 || (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); } case AArch64::SUBXrs: case AArch64::SUBSXrs: { unsigned Imm = MI.getOperand(3).getImm(); unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); return ShiftVal == 0 || (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); } case AArch64::SUBWrx: case AArch64::SUBXrx: case AArch64::SUBXrx64: case AArch64::SUBSWrx: case AArch64::SUBSXrx: case AArch64::SUBSXrx64: { unsigned Imm = MI.getOperand(3).getImm(); switch (AArch64_AM::getArithExtendType(Imm)) { default: return false; case AArch64_AM::UXTB: case AArch64_AM::UXTH: case AArch64_AM::UXTW: case AArch64_AM::UXTX: return AArch64_AM::getArithShiftValue(Imm) == 0; } } case AArch64::LDRBBroW: case AArch64::LDRBBroX: case AArch64::LDRBroW: case AArch64::LDRBroX: case AArch64::LDRDroW: case AArch64::LDRDroX: case AArch64::LDRHHroW: case AArch64::LDRHHroX: case AArch64::LDRHroW: case AArch64::LDRHroX: case AArch64::LDRQroW: case AArch64::LDRQroX: case AArch64::LDRSBWroW: case AArch64::LDRSBWroX: case AArch64::LDRSBXroW: case AArch64::LDRSBXroX: case AArch64::LDRSHWroW: case AArch64::LDRSHWroX: case AArch64::LDRSHXroW: case AArch64::LDRSHXroX: case AArch64::LDRSWroW: case AArch64::LDRSWroX: case AArch64::LDRSroW: case AArch64::LDRSroX: case AArch64::LDRWroW: case AArch64::LDRWroX: case AArch64::LDRXroW: case AArch64::LDRXroX: case AArch64::PRFMroW: case AArch64::PRFMroX: case AArch64::STRBBroW: case AArch64::STRBBroX: case AArch64::STRBroW: case AArch64::STRBroX: case AArch64::STRDroW: case AArch64::STRDroX: case AArch64::STRHHroW: case AArch64::STRHHroX: case AArch64::STRHroW: case AArch64::STRHroX: case AArch64::STRQroW: case AArch64::STRQroX: case AArch64::STRSroW: case AArch64::STRSroX: case AArch64::STRWroW: case AArch64::STRWroX: case AArch64::STRXroW: case AArch64::STRXroX: { unsigned IsSigned = MI.getOperand(3).getImm(); return !IsSigned; } } } bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); switch (Opc) { default: return false; case AArch64::SEH_StackAlloc: case AArch64::SEH_SaveFPLR: case AArch64::SEH_SaveFPLR_X: case AArch64::SEH_SaveReg: case AArch64::SEH_SaveReg_X: case AArch64::SEH_SaveRegP: case AArch64::SEH_SaveRegP_X: case AArch64::SEH_SaveFReg: case AArch64::SEH_SaveFReg_X: case AArch64::SEH_SaveFRegP: case AArch64::SEH_SaveFRegP_X: case AArch64::SEH_SetFP: case AArch64::SEH_AddFP: case AArch64::SEH_Nop: case AArch64::SEH_PrologEnd: case AArch64::SEH_EpilogStart: case AArch64::SEH_EpilogEnd: return true; } } bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const { switch (MI.getOpcode()) { default: return false; case AArch64::SBFMXri: // aka sxtw case AArch64::UBFMXri: // aka uxtw // Check for the 32 -> 64 bit extension case, these instructions can do // much more. if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) return false; // This is a signed or unsigned 32 -> 64 bit extension. SrcReg = MI.getOperand(1).getReg(); DstReg = MI.getOperand(0).getReg(); SubIdx = AArch64::sub_32; return true; } } bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; unsigned WidthA = 0, WidthB = 0; assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // Retrieve the base, offset from the base and width. Width // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If // base are identical, and the offset of a lower memory access + // the width doesn't overlap the offset of a higher memory access, // then the memory accesses are different. if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) { if (BaseOpA->isIdenticalTo(*BaseOpB)) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; if (LowOffset + LowWidth <= HighOffset) return true; } } return false; } bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const { if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) return true; switch (MI.getOpcode()) { case AArch64::HINT: // CSDB hints are scheduling barriers. if (MI.getOperand(0).getImm() == 0x14) return true; break; case AArch64::DSB: case AArch64::ISB: // DSB and ISB also are scheduling barriers. return true; default:; } return isSEHInstruction(MI); } /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { // The first operand can be a frame index where we'd normally expect a // register. assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); if (!MI.getOperand(1).isReg()) return false; switch (MI.getOpcode()) { default: break; case AArch64::SUBSWrr: case AArch64::SUBSWrs: case AArch64::SUBSWrx: case AArch64::SUBSXrr: case AArch64::SUBSXrs: case AArch64::SUBSXrx: case AArch64::ADDSWrr: case AArch64::ADDSWrs: case AArch64::ADDSWrx: case AArch64::ADDSXrr: case AArch64::ADDSXrs: case AArch64::ADDSXrx: // Replace SUBSWrr with SUBWrr if NZCV is not used. SrcReg = MI.getOperand(1).getReg(); SrcReg2 = MI.getOperand(2).getReg(); CmpMask = ~0; CmpValue = 0; return true; case AArch64::SUBSWri: case AArch64::ADDSWri: case AArch64::SUBSXri: case AArch64::ADDSXri: SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME: In order to convert CmpValue to 0 or 1 CmpValue = MI.getOperand(2).getImm() != 0; return true; case AArch64::ANDSWri: case AArch64::ANDSXri: // ANDS does not use the same encoding scheme as the others xxxS // instructions. SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME:The return val type of decodeLogicalImmediate is uint64_t, // while the type of CmpValue is int. When converting uint64_t to int, // the high 32 bits of uint64_t will be lost. // In fact it causes a bug in spec2006-483.xalancbmk // CmpValue is only used to compare with zero in OptimizeCompareInstr CmpValue = AArch64_AM::decodeLogicalImmediate( MI.getOperand(2).getImm(), MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; return true; } return false; } static bool UpdateOperandRegClass(MachineInstr &Instr) { MachineBasicBlock *MBB = Instr.getParent(); assert(MBB && "Can't get MachineBasicBlock here"); MachineFunction *MF = MBB->getParent(); assert(MF && "Can't get MachineFunction here"); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; ++OpIdx) { MachineOperand &MO = Instr.getOperand(OpIdx); const TargetRegisterClass *OpRegCstraints = Instr.getRegClassConstraint(OpIdx, TII, TRI); // If there's no constraint, there's nothing to do. if (!OpRegCstraints) continue; // If the operand is a frame index, there's nothing to do here. // A frame index operand will resolve correctly during PEI. if (MO.isFI()) continue; assert(MO.isReg() && "Operand has register constraints without being a register!"); unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) { if (!OpRegCstraints->contains(Reg)) return false; } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && !MRI->constrainRegClass(Reg, OpRegCstraints)) return false; } return true; } /// Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) MIDefinesZeroReg = true; switch (MI.getOpcode()) { default: return MI.getOpcode(); case AArch64::ADDSWrr: return AArch64::ADDWrr; case AArch64::ADDSWri: return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; case AArch64::ADDSWrs: return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; case AArch64::ADDSWrx: return AArch64::ADDWrx; case AArch64::ADDSXrr: return AArch64::ADDXrr; case AArch64::ADDSXri: return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; case AArch64::ADDSXrs: return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; case AArch64::ADDSXrx: return AArch64::ADDXrx; case AArch64::SUBSWrr: return AArch64::SUBWrr; case AArch64::SUBSWri: return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; case AArch64::SUBSWrs: return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; case AArch64::SUBSWrx: return AArch64::SUBWrx; case AArch64::SUBSXrr: return AArch64::SUBXrr; case AArch64::SUBSXri: return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; case AArch64::SUBSXrs: return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; case AArch64::SUBSXrx: return AArch64::SUBXrx; } } enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; /// True when condition flags are accessed (either by writing or reading) /// on the instruction trace starting at From and ending at To. /// /// Note: If From and To are from different blocks it's assumed CC are accessed /// on the path. static bool areCFlagsAccessedBetweenInstrs( MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { // Early exit if To is at the beginning of the BB. if (To == To->getParent()->begin()) return true; // Check whether the instructions are in the same basic block // If not, assume the condition flags might get modified somewhere. if (To->getParent() != From->getParent()) return true; // From must be above To. assert(std::find_if(++To.getReverse(), To->getParent()->rend(), [From](MachineInstr &MI) { return MI.getIterator() == From; }) != To->getParent()->rend()); // We iterate backward starting \p To until we hit \p From. for (--To; To != From; --To) { const MachineInstr &Instr = *To; if (((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) return true; } return false; } /// Try to optimize a compare instruction. A compare instruction is an /// instruction which produces AArch64::NZCV. It can be truly compare /// instruction /// when there are no uses of its destination register. /// /// The following steps are tried in order: /// 1. Convert CmpInstr into an unconditional version. /// 2. Remove CmpInstr if above there is an instruction producing a needed /// condition code or an instruction which can be converted into such an /// instruction. /// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { assert(CmpInstr.getParent()); assert(MRI); // Replace SUBSWrr with SUBWrr if NZCV is not used. int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); if (DeadNZCVIdx != -1) { if (CmpInstr.definesRegister(AArch64::WZR) || CmpInstr.definesRegister(AArch64::XZR)) { CmpInstr.eraseFromParent(); return true; } unsigned Opc = CmpInstr.getOpcode(); unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); CmpInstr.setDesc(MCID); CmpInstr.RemoveOperand(DeadNZCVIdx); bool succeeded = UpdateOperandRegClass(CmpInstr); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); return true; } // Continue only if we have a "ri" where immediate is zero. // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare // function. assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!"); if (CmpValue != 0 || SrcReg2 != 0) return false; // CmpInstr is a Compare instruction if destination register is not used. if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; return substituteCmpToZero(CmpInstr, SrcReg, MRI); } /// Get opcode of S version of Instr. /// If Instr is S version its opcode is returned. /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version /// or we are not interested in it. static unsigned sForm(MachineInstr &Instr) { switch (Instr.getOpcode()) { default: return AArch64::INSTRUCTION_LIST_END; case AArch64::ADDSWrr: case AArch64::ADDSWri: case AArch64::ADDSXrr: case AArch64::ADDSXri: case AArch64::SUBSWrr: case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: return Instr.getOpcode(); case AArch64::ADDWrr: return AArch64::ADDSWrr; case AArch64::ADDWri: return AArch64::ADDSWri; case AArch64::ADDXrr: return AArch64::ADDSXrr; case AArch64::ADDXri: return AArch64::ADDSXri; case AArch64::ADCWr: return AArch64::ADCSWr; case AArch64::ADCXr: return AArch64::ADCSXr; case AArch64::SUBWrr: return AArch64::SUBSWrr; case AArch64::SUBWri: return AArch64::SUBSWri; case AArch64::SUBXrr: return AArch64::SUBSXrr; case AArch64::SUBXri: return AArch64::SUBSXri; case AArch64::SBCWr: return AArch64::SBCSWr; case AArch64::SBCXr: return AArch64::SBCSXr; case AArch64::ANDWri: return AArch64::ANDSWri; case AArch64::ANDXri: return AArch64::ANDSXri; } } /// Check if AArch64::NZCV should be alive in successors of MBB. static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { for (auto *BB : MBB->successors()) if (BB->isLiveIn(AArch64::NZCV)) return true; return false; } namespace { struct UsedNZCV { bool N = false; bool Z = false; bool C = false; bool V = false; UsedNZCV() = default; UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { this->N |= UsedFlags.N; this->Z |= UsedFlags.Z; this->C |= UsedFlags.C; this->V |= UsedFlags.V; return *this; } }; } // end anonymous namespace /// Find a condition code used by the instruction. /// Returns AArch64CC::Invalid if either the instruction does not use condition /// codes or we don't optimize CmpInstr in the presence of such instructions. static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { switch (Instr.getOpcode()) { default: return AArch64CC::Invalid; case AArch64::Bcc: { int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); assert(Idx >= 2); return static_cast(Instr.getOperand(Idx - 2).getImm()); } case AArch64::CSINVWr: case AArch64::CSINVXr: case AArch64::CSINCWr: case AArch64::CSINCXr: case AArch64::CSELWr: case AArch64::CSELXr: case AArch64::CSNEGWr: case AArch64::CSNEGXr: case AArch64::FCSELSrrr: case AArch64::FCSELDrrr: { int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); assert(Idx >= 1); return static_cast(Instr.getOperand(Idx - 1).getImm()); } } } static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { assert(CC != AArch64CC::Invalid); UsedNZCV UsedFlags; switch (CC) { default: break; case AArch64CC::EQ: // Z set case AArch64CC::NE: // Z clear UsedFlags.Z = true; break; case AArch64CC::HI: // Z clear and C set case AArch64CC::LS: // Z set or C clear UsedFlags.Z = true; LLVM_FALLTHROUGH; case AArch64CC::HS: // C set case AArch64CC::LO: // C clear UsedFlags.C = true; break; case AArch64CC::MI: // N set case AArch64CC::PL: // N clear UsedFlags.N = true; break; case AArch64CC::VS: // V set case AArch64CC::VC: // V clear UsedFlags.V = true; break; case AArch64CC::GT: // Z clear, N and V the same case AArch64CC::LE: // Z set, N and V differ UsedFlags.Z = true; LLVM_FALLTHROUGH; case AArch64CC::GE: // N and V the same case AArch64CC::LT: // N and V differ UsedFlags.N = true; UsedFlags.V = true; break; } return UsedFlags; } static bool isADDSRegImm(unsigned Opcode) { return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; } static bool isSUBSRegImm(unsigned Opcode) { return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; } /// Check if CmpInstr can be substituted by MI. /// /// CmpInstr can be substituted: /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' /// - and, MI and CmpInstr are from the same MachineBB /// - and, condition flags are not alive in successors of the CmpInstr parent /// - and, if MI opcode is the S form there must be no defs of flags between /// MI and CmpInstr /// or if MI opcode is not the S form there must be neither defs of flags /// nor uses of flags between MI and CmpInstr. /// - and C/V flags are not used after CmpInstr static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, const TargetRegisterInfo *TRI) { assert(MI); assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); assert(CmpInstr); const unsigned CmpOpcode = CmpInstr->getOpcode(); if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) return false; if (MI->getParent() != CmpInstr->getParent()) return false; if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) return false; AccessKind AccessToCheck = AK_Write; if (sForm(*MI) != MI->getOpcode()) AccessToCheck = AK_All; if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) return false; UsedNZCV NZCVUsedAfterCmp; for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end(); I != E; ++I) { const MachineInstr &Instr = *I; if (Instr.readsRegister(AArch64::NZCV, TRI)) { AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); if (CC == AArch64CC::Invalid) // Unsupported conditional instruction return false; NZCVUsedAfterCmp |= getUsedNZCV(CC); } if (Instr.modifiesRegister(AArch64::NZCV, TRI)) break; } return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; } /// Substitute an instruction comparing to zero with another instruction /// which produces needed condition flags. /// /// Return true on success. bool AArch64InstrInfo::substituteCmpToZero( MachineInstr &CmpInstr, unsigned SrcReg, const MachineRegisterInfo *MRI) const { assert(MRI); // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); if (!MI) return false; const TargetRegisterInfo *TRI = &getRegisterInfo(); unsigned NewOpc = sForm(*MI); if (NewOpc == AArch64::INSTRUCTION_LIST_END) return false; if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) return false; // Update the instruction to set NZCV. MI->setDesc(get(NewOpc)); CmpInstr.eraseFromParent(); bool succeeded = UpdateOperandRegClass(*MI); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); MI->addRegisterDefined(AArch64::NZCV, TRI); return true; } bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && MI.getOpcode() != AArch64::CATCHRET) return false; MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); if (MI.getOpcode() == AArch64::CATCHRET) { // Skip to the first instruction before the epilog. const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); auto MBBI = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI); while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) && FirstEpilogSEH != MBB.begin()) FirstEpilogSEH = std::prev(FirstEpilogSEH); if (FirstEpilogSEH != MBB.begin()) FirstEpilogSEH = std::next(FirstEpilogSEH); BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP)) .addReg(AArch64::X0, RegState::Define) .addMBB(TargetMBB); BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri)) .addReg(AArch64::X0, RegState::Define) .addReg(AArch64::X0) .addMBB(TargetMBB) .addImm(0); return true; } unsigned Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = cast((*MI.memoperands_begin())->getValue()); const TargetMachine &TM = MBB.getParent()->getTarget(); unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); const unsigned char MO_NC = AArch64II::MO_NC; if ((OpFlags & AArch64II::MO_GOT) != 0) { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, OpFlags); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addImm(0) .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) .addImm(0); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) .addImm(16); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) .addImm(32); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, AArch64II::MO_G3) .addImm(48); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addImm(0) .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Tiny) { BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) .addGlobalAddress(GV, 0, OpFlags); } else { BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, LoFlags) .addMemOperand(*MI.memoperands_begin()); } MBB.erase(MI); return true; } // Return true if this instruction simply sets its single destination register // to zero. This is equivalent to a register rename of the zero-register. bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) { switch (MI.getOpcode()) { default: break; case AArch64::MOVZWi: case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); return true; } break; case AArch64::ANDWri: // and Rd, Rzr, #imm return MI.getOperand(1).getReg() == AArch64::WZR; case AArch64::ANDXri: return MI.getOperand(1).getReg() == AArch64::XZR; case TargetOpcode::COPY: return MI.getOperand(1).getReg() == AArch64::WZR; } return false; } // Return true if this instruction simply renames a general register without // modifying bits. bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) { switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // GPR32 copies will by lowered to ORRXrs unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::GPR32RegClass.contains(DstReg) || AArch64::GPR64RegClass.contains(DstReg)); } case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) if (MI.getOperand(1).getReg() == AArch64::XZR) { assert(MI.getDesc().getNumOperands() == 4 && MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); return true; } break; case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) if (MI.getOperand(2).getImm() == 0) { assert(MI.getDesc().getNumOperands() == 4 && MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); return true; } break; } return false; } // Return true if this instruction simply renames a general register without // modifying bits. bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // FPR64 copies will by lowered to ORR.16b unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::FPR64RegClass.contains(DstReg) || AArch64::FPR128RegClass.contains(DstReg)); } case AArch64::ORRv16i8: if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && "invalid ORRv16i8 operands"); return true; } break; } return false; } unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { switch (MI.getOpcode()) { default: break; case AArch64::LDRWui: case AArch64::LDRXui: case AArch64::LDRBui: case AArch64::LDRHui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { FrameIndex = MI.getOperand(1).getIndex(); return MI.getOperand(0).getReg(); } break; } return 0; } unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { switch (MI.getOpcode()) { default: break; case AArch64::STRWui: case AArch64::STRXui: case AArch64::STRBui: case AArch64::STRHui: case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { FrameIndex = MI.getOperand(1).getIndex(); return MI.getOperand(0).getReg(); } break; } return 0; } /// Check all MachineMemOperands for a hint to suppress pairing. bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { return MMO->getFlags() & MOSuppressPair; }); } /// Set a flag on the first MachineMemOperand to suppress pairing. void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { if (MI.memoperands_empty()) return; (*MI.memoperands_begin())->setFlags(MOSuppressPair); } /// Check all MachineMemOperands for a hint that the load/store is strided. bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { return MMO->getFlags() & MOStridedAccess; }); } bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) { switch (Opc) { default: return false; case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: case AArch64::STURBBi: case AArch64::STURHHi: case AArch64::STURWi: case AArch64::STURXi: case AArch64::LDURSi: case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: case AArch64::LDURHHi: case AArch64::LDURBBi: case AArch64::LDURSBWi: case AArch64::LDURSHWi: return true; } } Optional AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { switch (Opc) { default: return {}; case AArch64::PRFMui: return AArch64::PRFUMi; case AArch64::LDRXui: return AArch64::LDURXi; case AArch64::LDRWui: return AArch64::LDURWi; case AArch64::LDRBui: return AArch64::LDURBi; case AArch64::LDRHui: return AArch64::LDURHi; case AArch64::LDRSui: return AArch64::LDURSi; case AArch64::LDRDui: return AArch64::LDURDi; case AArch64::LDRQui: return AArch64::LDURQi; case AArch64::LDRBBui: return AArch64::LDURBBi; case AArch64::LDRHHui: return AArch64::LDURHHi; case AArch64::LDRSBXui: return AArch64::LDURSBXi; case AArch64::LDRSBWui: return AArch64::LDURSBWi; case AArch64::LDRSHXui: return AArch64::LDURSHXi; case AArch64::LDRSHWui: return AArch64::LDURSHWi; case AArch64::LDRSWui: return AArch64::LDURSWi; case AArch64::STRXui: return AArch64::STURXi; case AArch64::STRWui: return AArch64::STURWi; case AArch64::STRBui: return AArch64::STURBi; case AArch64::STRHui: return AArch64::STURHi; case AArch64::STRSui: return AArch64::STURSi; case AArch64::STRDui: return AArch64::STURDi; case AArch64::STRQui: return AArch64::STURQi; case AArch64::STRBBui: return AArch64::STURBBi; case AArch64::STRHHui: return AArch64::STURHHi; } } unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { switch (Opc) { default: return 2; case AArch64::LDPXi: case AArch64::LDPDi: case AArch64::STPXi: case AArch64::STPDi: case AArch64::LDNPXi: case AArch64::LDNPDi: case AArch64::STNPXi: case AArch64::STNPDi: case AArch64::LDPQi: case AArch64::STPQi: case AArch64::LDNPQi: case AArch64::STNPQi: case AArch64::LDPWi: case AArch64::LDPSi: case AArch64::STPWi: case AArch64::STPSi: case AArch64::LDNPWi: case AArch64::LDNPSi: case AArch64::STNPWi: case AArch64::STNPSi: case AArch64::LDG: case AArch64::STGPi: return 3; case AArch64::ADDG: case AArch64::STGOffset: return 2; } } bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return false; // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: case AArch64::STRXui: case AArch64::STRWui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRSWui: // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: case AArch64::STURWi: case AArch64::STURXi: case AArch64::LDURSi: case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: return true; } } unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) { switch (Opc) { default: llvm_unreachable("Opcode has no flag setting equivalent!"); // 32-bit cases: case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri; case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr; case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs; case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx; case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri; case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr; case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs; case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr; case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs; case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri; case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr; case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs; case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx; // 64-bit cases: case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri; case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr; case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs; case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx; case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri; case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr; case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs; case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr; case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs; case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri; case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr; case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs; case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx; } } // Is this a candidate for ld/st merging or pairing? For example, we don't // touch volatiles or load/stores that have a hint to avoid pair formation. bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { // If this is a volatile load/store, don't mess with it. if (MI.hasOrderedMemoryRef()) return false; // Make sure this is a reg/fi+imm (as opposed to an address reloc). assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) && "Expected a reg or frame index operand."); if (!MI.getOperand(2).isImm()) return false; // Can't merge/pair if the instruction modifies the base register. // e.g., ldr x0, [x0] // This case will never occur with an FI base. if (MI.getOperand(1).isReg()) { unsigned BaseReg = MI.getOperand(1).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (MI.modifiesRegister(BaseReg, TRI)) return false; } // Check if this load/store has a hint to avoid pair formation. // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. if (isLdStPairSuppressed(MI)) + return false; + + // Do not pair any callee-save store/reload instructions in the + // prologue/epilogue if the CFI information encoded the operations as separate + // instructions, as that will cause the size of the actual prologue to mismatch + // with the prologue size recorded in the Windows CFI. + const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); + bool NeedsWinCFI = MAI->usesWindowsCFI() && + MI.getMF()->getFunction().needsUnwindTableEntry(); + if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy))) return false; // On some CPUs quad load/store pairs are slower than two single load/stores. if (Subtarget.isPaired128Slow()) { switch (MI.getOpcode()) { default: break; case AArch64::LDURQi: case AArch64::STURQi: case AArch64::LDRQui: case AArch64::STRQui: return false; } } return true; } bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const { unsigned Width; return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI); } bool AArch64InstrInfo::getMemOperandWithOffsetWidth( const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. if (LdSt.getNumExplicitOperands() == 3) { // Non-paired instruction (e.g., ldr x1, [x0, #8]). if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || !LdSt.getOperand(2).isImm()) return false; } else if (LdSt.getNumExplicitOperands() == 4) { // Paired instruction (e.g., ldp x1, x2, [x0, #8]). if (!LdSt.getOperand(1).isReg() || (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || !LdSt.getOperand(3).isImm()) return false; } else return false; // Get the scaling factor for the instruction and set the width for the // instruction. unsigned Scale = 0; int64_t Dummy1, Dummy2; // If this returns false, then it's an instruction we don't want to handle. if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) return false; // Compute the offset. Offset is calculated as the immediate operand // multiplied by the scaling factor. Unscaled instructions have scaling factor // set to 1. if (LdSt.getNumExplicitOperands() == 3) { BaseOp = &LdSt.getOperand(1); Offset = LdSt.getOperand(2).getImm() * Scale; } else { assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); BaseOp = &LdSt.getOperand(2); Offset = LdSt.getOperand(3).getImm() * Scale; } assert((BaseOp->isReg() || BaseOp->isFI()) && "getMemOperandWithOffset only supports base " "operands of type register or frame index."); return true; } MachineOperand & AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); assert(OfsOp.isImm() && "Offset operand wasn't immediate."); return OfsOp; } bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, int64_t &MinOffset, int64_t &MaxOffset) { switch (Opcode) { // Not a memory operation or something we want to handle. default: Scale = Width = 0; MinOffset = MaxOffset = 0; return false; case AArch64::STRWpost: case AArch64::LDRWpost: Width = 32; Scale = 4; MinOffset = -256; MaxOffset = 255; break; case AArch64::LDURQi: case AArch64::STURQi: Width = 16; Scale = 1; MinOffset = -256; MaxOffset = 255; break; case AArch64::PRFUMi: case AArch64::LDURXi: case AArch64::LDURDi: case AArch64::STURXi: case AArch64::STURDi: Width = 8; Scale = 1; MinOffset = -256; MaxOffset = 255; break; case AArch64::LDURWi: case AArch64::LDURSi: case AArch64::LDURSWi: case AArch64::STURWi: case AArch64::STURSi: Width = 4; Scale = 1; MinOffset = -256; MaxOffset = 255; break; case AArch64::LDURHi: case AArch64::LDURHHi: case AArch64::LDURSHXi: case AArch64::LDURSHWi: case AArch64::STURHi: case AArch64::STURHHi: Width = 2; Scale = 1; MinOffset = -256; MaxOffset = 255; break; case AArch64::LDURBi: case AArch64::LDURBBi: case AArch64::LDURSBXi: case AArch64::LDURSBWi: case AArch64::STURBi: case AArch64::STURBBi: Width = 1; Scale = 1; MinOffset = -256; MaxOffset = 255; break; case AArch64::LDPQi: case AArch64::LDNPQi: case AArch64::STPQi: case AArch64::STNPQi: Scale = 16; Width = 32; MinOffset = -64; MaxOffset = 63; break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; MinOffset = 0; MaxOffset = 4095; break; case AArch64::LDPXi: case AArch64::LDPDi: case AArch64::LDNPXi: case AArch64::LDNPDi: case AArch64::STPXi: case AArch64::STPDi: case AArch64::STNPXi: case AArch64::STNPDi: Scale = 8; Width = 16; MinOffset = -64; MaxOffset = 63; break; case AArch64::PRFMui: case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: Scale = Width = 8; MinOffset = 0; MaxOffset = 4095; break; case AArch64::LDPWi: case AArch64::LDPSi: case AArch64::LDNPWi: case AArch64::LDNPSi: case AArch64::STPWi: case AArch64::STPSi: case AArch64::STNPWi: case AArch64::STNPSi: Scale = 4; Width = 8; MinOffset = -64; MaxOffset = 63; break; case AArch64::LDRWui: case AArch64::LDRSui: case AArch64::LDRSWui: case AArch64::STRWui: case AArch64::STRSui: Scale = Width = 4; MinOffset = 0; MaxOffset = 4095; break; case AArch64::LDRHui: case AArch64::LDRHHui: case AArch64::LDRSHWui: case AArch64::LDRSHXui: case AArch64::STRHui: case AArch64::STRHHui: Scale = Width = 2; MinOffset = 0; MaxOffset = 4095; break; case AArch64::LDRBui: case AArch64::LDRBBui: case AArch64::LDRSBWui: case AArch64::LDRSBXui: case AArch64::STRBui: case AArch64::STRBBui: Scale = Width = 1; MinOffset = 0; MaxOffset = 4095; break; case AArch64::ADDG: case AArch64::TAGPstack: Scale = 16; Width = 0; MinOffset = 0; MaxOffset = 63; break; case AArch64::LDG: case AArch64::STGOffset: case AArch64::STZGOffset: Scale = Width = 16; MinOffset = -256; MaxOffset = 255; break; case AArch64::ST2GOffset: case AArch64::STZ2GOffset: Scale = 16; Width = 32; MinOffset = -256; MaxOffset = 255; break; case AArch64::STGPi: Scale = Width = 16; MinOffset = -64; MaxOffset = 63; break; } return true; } static unsigned getOffsetStride(unsigned Opc) { switch (Opc) { default: return 0; case AArch64::LDURQi: case AArch64::STURQi: return 16; case AArch64::LDURXi: case AArch64::LDURDi: case AArch64::STURXi: case AArch64::STURDi: return 8; case AArch64::LDURWi: case AArch64::LDURSi: case AArch64::LDURSWi: case AArch64::STURWi: case AArch64::STURSi: return 4; } } // Scale the unscaled offsets. Returns false if the unscaled offset can't be // scaled. static bool scaleOffset(unsigned Opc, int64_t &Offset) { unsigned OffsetStride = getOffsetStride(Opc); if (OffsetStride == 0) return false; // If the byte-offset isn't a multiple of the stride, we can't scale this // offset. if (Offset % OffsetStride != 0) return false; // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. Offset /= OffsetStride; return true; } // Unscale the scaled offsets. Returns false if the scaled offset can't be // unscaled. static bool unscaleOffset(unsigned Opc, int64_t &Offset) { unsigned OffsetStride = getOffsetStride(Opc); if (OffsetStride == 0) return false; // Convert the "element" offset used by scaled pair load/store instructions // into the byte-offset used by unscaled. Offset *= OffsetStride; return true; } static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { if (FirstOpc == SecondOpc) return true; // We can also pair sign-ext and zero-ext instructions. switch (FirstOpc) { default: return false; case AArch64::LDRWui: case AArch64::LDURWi: return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; case AArch64::LDRSWui: case AArch64::LDURSWi: return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; } // These instructions can't be paired based on their opcodes. return false; } static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2) { // Accesses through fixed stack object frame indices may access a different // fixed stack slot. Check that the object offsets + offsets match. if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); // Get the byte-offset from the object offset. if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2)) return false; ObjectOffset1 += Offset1; ObjectOffset2 += Offset2; // Get the "element" index in the object. if (!scaleOffset(Opcode1, ObjectOffset1) || !scaleOffset(Opcode2, ObjectOffset2)) return false; return ObjectOffset1 + 1 == ObjectOffset2; } return FI1 == FI2; } /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOperandWithOffset returns true. bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, const MachineOperand &BaseOp2, unsigned NumLoads) const { const MachineInstr &FirstLdSt = *BaseOp1.getParent(); const MachineInstr &SecondLdSt = *BaseOp2.getParent(); if (BaseOp1.getType() != BaseOp2.getType()) return false; assert((BaseOp1.isReg() || BaseOp1.isFI()) && "Only base registers and frame indices are supported."); // Check for both base regs and base FI. if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) return false; // Only cluster up to a single pair. if (NumLoads > 1) return false; if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) return false; // Can we pair these instructions based on their opcodes? unsigned FirstOpc = FirstLdSt.getOpcode(); unsigned SecondOpc = SecondLdSt.getOpcode(); if (!canPairLdStOpc(FirstOpc, SecondOpc)) return false; // Can't merge volatiles or load/stores that have a hint to avoid pair // formation, for example. if (!isCandidateToMergeOrPair(FirstLdSt) || !isCandidateToMergeOrPair(SecondLdSt)) return false; // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) return false; int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) return false; // Pairwise instructions have a 7-bit signed offset field. if (Offset1 > 63 || Offset1 < -64) return false; // The caller should already have ordered First/SecondLdSt by offset. // Note: except for non-equal frame index bases if (BaseOp1.isFI()) { assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) && "Caller should have ordered offsets."); const MachineFrameInfo &MFI = FirstLdSt.getParent()->getParent()->getFrameInfo(); return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, BaseOp2.getIndex(), Offset2, SecondOpc); } assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && "Caller should have ordered offsets."); return Offset1 + 1 == Offset2; } static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, unsigned Reg, unsigned SubIdx, unsigned State, const TargetRegisterInfo *TRI) { if (!SubIdx) return MIB.addReg(Reg, State); if (TargetRegisterInfo::isPhysicalRegister(Reg)) return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs) { // We really want the positive remainder mod 32 here, that happens to be // easily obtainable with a mask. return ((DestReg - SrcReg) & 0x1f) < NumRegs; } void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, ArrayRef Indices) const { assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); const TargetRegisterInfo *TRI = &getRegisterInfo(); uint16_t DestEncoding = TRI->getEncodingValue(DestReg); uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); unsigned NumRegs = Indices.size(); int SubReg = 0, End = NumRegs, Incr = 1; if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { SubReg = NumRegs - 1; End = -1; Incr = -1; } for (; SubReg != End; SubReg += Incr) { const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); } } void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef Indices) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); unsigned NumRegs = Indices.size(); #ifndef NDEBUG uint16_t DestEncoding = TRI->getEncodingValue(DestReg); uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && "GPR reg sequences should not be able to overlap"); #endif for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); MIB.addReg(ZeroReg); AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); MIB.addImm(0); } } void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { const TargetRegisterInfo *TRI = &getRegisterInfo(); if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { // If either operand is WSP, expand to ADD #0. if (Subtarget.hasZeroCycleRegMove()) { // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); // This instruction is reading and writing X registers. This may upset // the register scavenger and machine verifier, so we need to indicate // that we are reading an undefined value from SrcRegX, but a proper // value from SrcReg. BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) .addReg(SrcRegX, RegState::Undef) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); } else { BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { if (Subtarget.hasZeroCycleRegMove()) { // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); // This instruction is reading and writing X registers. This may upset // the register scavenger and machine verifier, so we need to indicate // that we are reading an undefined value from SrcRegX, but a proper // value from SrcReg. BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) .addReg(AArch64::XZR) .addReg(SrcRegX, RegState::Undef) .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); } else { // Otherwise, expand to ORR WZR. BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) .addReg(AArch64::WZR) .addReg(SrcReg, getKillRegState(KillSrc)); } } return; } if (AArch64::GPR64spRegClass.contains(DestReg) && (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { // If either operand is SP, expand to ADD #0. BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { // Otherwise, expand to ORR XZR. BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) .addReg(AArch64::XZR) .addReg(SrcReg, getKillRegState(KillSrc)); } return; } // Copy a DDDD register quad by copying the individual sub-registers. if (AArch64::DDDDRegClass.contains(DestReg) && AArch64::DDDDRegClass.contains(SrcReg)) { static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, AArch64::dsub2, AArch64::dsub3}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, Indices); return; } // Copy a DDD register triple by copying the individual sub-registers. if (AArch64::DDDRegClass.contains(DestReg) && AArch64::DDDRegClass.contains(SrcReg)) { static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, AArch64::dsub2}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, Indices); return; } // Copy a DD register pair by copying the individual sub-registers. if (AArch64::DDRegClass.contains(DestReg) && AArch64::DDRegClass.contains(SrcReg)) { static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, Indices); return; } // Copy a QQQQ register quad by copying the individual sub-registers. if (AArch64::QQQQRegClass.contains(DestReg) && AArch64::QQQQRegClass.contains(SrcReg)) { static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, AArch64::qsub3}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, Indices); return; } // Copy a QQQ register triple by copying the individual sub-registers. if (AArch64::QQQRegClass.contains(DestReg) && AArch64::QQQRegClass.contains(SrcReg)) { static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, AArch64::qsub2}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, Indices); return; } // Copy a QQ register pair by copying the individual sub-registers. if (AArch64::QQRegClass.contains(DestReg) && AArch64::QQRegClass.contains(SrcReg)) { static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, Indices); return; } if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, AArch64::XZR, Indices); return; } if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, AArch64::WZR, Indices); return; } if (AArch64::FPR128RegClass.contains(DestReg) && AArch64::FPR128RegClass.contains(SrcReg)) { if (Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else { BuildMI(MBB, I, DL, get(AArch64::STRQpre)) .addReg(AArch64::SP, RegState::Define) .addReg(SrcReg, getKillRegState(KillSrc)) .addReg(AArch64::SP) .addImm(-16); BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) .addReg(AArch64::SP, RegState::Define) .addReg(DestReg, RegState::Define) .addReg(AArch64::SP) .addImm(16); } return; } if (AArch64::FPR64RegClass.contains(DestReg) && AArch64::FPR64RegClass.contains(SrcReg)) { if (Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, &AArch64::FPR128RegClass); BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else { BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } return; } if (AArch64::FPR32RegClass.contains(DestReg) && AArch64::FPR32RegClass.contains(SrcReg)) { if (Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, &AArch64::FPR128RegClass); BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else { BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } return; } if (AArch64::FPR16RegClass.contains(DestReg) && AArch64::FPR16RegClass.contains(SrcReg)) { if (Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR128RegClass); BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } return; } if (AArch64::FPR8RegClass.contains(DestReg) && AArch64::FPR8RegClass.contains(SrcReg)) { if (Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR128RegClass); BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); } return; } // Copies between GPR64 and FPR64. if (AArch64::FPR64RegClass.contains(DestReg) && AArch64::GPR64RegClass.contains(SrcReg)) { BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } if (AArch64::GPR64RegClass.contains(DestReg) && AArch64::FPR64RegClass.contains(SrcReg)) { BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } // Copies between GPR32 and FPR32. if (AArch64::FPR32RegClass.contains(DestReg) && AArch64::GPR32RegClass.contains(SrcReg)) { BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } if (AArch64::GPR32RegClass.contains(DestReg) && AArch64::FPR32RegClass.contains(SrcReg)) { BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } if (DestReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); BuildMI(MBB, I, DL, get(AArch64::MSR)) .addImm(AArch64SysReg::NZCV) .addReg(SrcReg, getKillRegState(KillSrc)) .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); return; } if (SrcReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) .addImm(AArch64SysReg::NZCV) .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); return; } llvm_unreachable("unimplemented reg-to-reg copy"); } static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, unsigned SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO) { unsigned SrcReg0 = SrcReg; unsigned SrcReg1 = SrcReg; if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); SubIdx0 = 0; SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); SubIdx1 = 0; } BuildMI(MBB, InsertBefore, DebugLoc(), MCID) .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) .addFrameIndex(FI) .addImm(0) .addMemOperand(MMO); } void AArch64InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; bool Offset = true; switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::STRBui; break; case 2: if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::STRHui; break; case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { Opc = AArch64::STRWui; if (TargetRegisterInfo::isVirtualRegister(SrcReg)) MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); else assert(SrcReg != AArch64::WSP); } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) Opc = AArch64::STRSui; break; case 8: if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { Opc = AArch64::STRXui; if (TargetRegisterInfo::isVirtualRegister(SrcReg)) MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); else assert(SrcReg != AArch64::SP); } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { Opc = AArch64::STRDui; } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, get(AArch64::STPWi), SrcReg, isKill, AArch64::sube32, AArch64::subo32, FI, MMO); return; } break; case 16: if (AArch64::FPR128RegClass.hasSubClassEq(RC)) Opc = AArch64::STRQui; else if (AArch64::DDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov1d; Offset = false; } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, get(AArch64::STPXi), SrcReg, isKill, AArch64::sube64, AArch64::subo64, FI, MMO); return; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Threev1d; Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv1d; Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d; Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Threev2d; Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d; Offset = false; } break; } assert(Opc && "Unknown register class"); const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI); if (Offset) MI.addImm(0); MI.addMemOperand(MMO); } static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, unsigned DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO) { unsigned DestReg0 = DestReg; unsigned DestReg1 = DestReg; bool IsUndef = true; if (TargetRegisterInfo::isPhysicalRegister(DestReg)) { DestReg0 = TRI.getSubReg(DestReg, SubIdx0); SubIdx0 = 0; DestReg1 = TRI.getSubReg(DestReg, SubIdx1); SubIdx1 = 0; IsUndef = false; } BuildMI(MBB, InsertBefore, DebugLoc(), MCID) .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) .addFrameIndex(FI) .addImm(0) .addMemOperand(MMO); } void AArch64InstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); unsigned Opc = 0; bool Offset = true; switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRBui; break; case 2: if (AArch64::FPR16RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRHui; break; case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { Opc = AArch64::LDRWui; if (TargetRegisterInfo::isVirtualRegister(DestReg)) MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); else assert(DestReg != AArch64::WSP); } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRSui; break; case 8: if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { Opc = AArch64::LDRXui; if (TargetRegisterInfo::isVirtualRegister(DestReg)) MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); else assert(DestReg != AArch64::SP); } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { Opc = AArch64::LDRDui; } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, get(AArch64::LDPWi), DestReg, AArch64::sube32, AArch64::subo32, FI, MMO); return; } break; case 16: if (AArch64::FPR128RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRQui; else if (AArch64::DDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov1d; Offset = false; } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, get(AArch64::LDPXi), DestReg, AArch64::sube64, AArch64::subo64, FI, MMO); return; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Threev1d; Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv1d; Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d; Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Threev2d; Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv2d; Offset = false; } break; } assert(Opc && "Unknown register class"); const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) .addReg(DestReg, getDefRegState(true)) .addFrameIndex(FI); if (Offset) MI.addImm(0); MI.addMemOperand(MMO); } void llvm::emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV, bool NeedsWinCFI, bool *HasWinCFI) { if (DestReg == SrcReg && Offset == 0) return; assert((DestReg != AArch64::SP || Offset % 16 == 0) && "SP increment/decrement not 16-byte aligned"); bool isSub = Offset < 0; if (isSub) Offset = -Offset; // FIXME: If the offset won't fit in 24-bits, compute the offset into a // scratch register. If DestReg is a virtual register, use it as the // scratch register; otherwise, create a new virtual register (to be // replaced by the scavenger at the end of PEI). That case can be optimized // slightly if DestReg is SP which is always 16-byte aligned, so the scratch // register can be loaded with offset%8 and the add/sub can use an extending // instruction with LSL#3. // Currently the function handles any offsets but generates a poor sequence // of code. // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); unsigned Opc; if (SetNZCV) Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri; else Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri; const unsigned MaxEncoding = 0xfff; const unsigned ShiftSize = 12; const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; while (((unsigned)Offset) >= (1 << ShiftSize)) { unsigned ThisVal; if (((unsigned)Offset) > MaxEncodableValue) { ThisVal = MaxEncodableValue; } else { ThisVal = Offset & MaxEncodableValue; } assert((ThisVal >> ShiftSize) <= MaxEncoding && "Encoding cannot handle value that big"); BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) .addReg(SrcReg) .addImm(ThisVal >> ShiftSize) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) .setMIFlag(Flag); if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) { if (HasWinCFI) *HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) .addImm(ThisVal) .setMIFlag(Flag); } SrcReg = DestReg; Offset -= ThisVal; if (Offset == 0) return; } BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) .addReg(SrcReg) .addImm(Offset) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) .setMIFlag(Flag); if (NeedsWinCFI) { if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { if (HasWinCFI) *HasWinCFI = true; if (Offset == 0) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)). setMIFlag(Flag); else BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)). addImm(Offset).setMIFlag(Flag); } else if (DestReg == AArch64::SP) { if (HasWinCFI) *HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)). addImm(Offset).setMIFlag(Flag); } } } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, VirtRegMap *VRM) const { // This is a bit of a hack. Consider this instruction: // // %0 = COPY %sp; GPR64all:%0 // // We explicitly chose GPR64all for the virtual register so such a copy might // be eliminated by RegisterCoalescer. However, that may not be possible, and // %0 may even spill. We can't spill %sp, and since it is in the GPR64all // register class, TargetInstrInfo::foldMemoryOperand() is going to try. // // To prevent that, we are going to constrain the %0 register class here. // // // if (MI.isFullCopy()) { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); return nullptr; } if (DstReg == AArch64::SP && TargetRegisterInfo::isVirtualRegister(SrcReg)) { MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); return nullptr; } } // Handle the case where a copy is being spilled or filled but the source // and destination register class don't match. For example: // // %0 = COPY %xzr; GPR64common:%0 // // In this case we can still safely fold away the COPY and generate the // following spill code: // // STRXui %xzr, %stack.0 // // This also eliminates spilled cross register class COPYs (e.g. between x and // d regs) of the same size. For example: // // %0 = COPY %1; GPR64:%0, FPR64:%1 // // will be filled as // // LDRDui %0, fi<#0> // // instead of // // LDRXui %Temp, fi<#0> // %0 = FMOV %Temp // if (MI.isCopy() && Ops.size() == 1 && // Make sure we're only folding the explicit COPY defs/uses. (Ops[0] == 0 || Ops[0] == 1)) { bool IsSpill = Ops[0] == 0; bool IsFill = !IsSpill; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock &MBB = *MI.getParent(); const MachineOperand &DstMO = MI.getOperand(0); const MachineOperand &SrcMO = MI.getOperand(1); unsigned DstReg = DstMO.getReg(); unsigned SrcReg = SrcMO.getReg(); // This is slightly expensive to compute for physical regs since // getMinimalPhysRegClass is slow. auto getRegClass = [&](unsigned Reg) { return TargetRegisterInfo::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) : TRI.getMinimalPhysRegClass(Reg); }; if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == TRI.getRegSizeInBits(*getRegClass(SrcReg)) && "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, getRegClass(SrcReg), &TRI); else loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, getRegClass(DstReg), &TRI); return &*--InsertPt; } // Handle cases like spilling def of: // // %0:sub_32 = COPY %wzr; GPR64common:%0 // // where the physical register source can be widened and stored to the full // virtual reg destination stack slot, in this case producing: // // STRXui %xzr, %stack.0 // if (IsSpill && DstMO.isUndef() && TargetRegisterInfo::isPhysicalRegister(SrcReg)) { assert(SrcMO.getSubReg() == 0 && "Unexpected subreg on physical register"); const TargetRegisterClass *SpillRC; unsigned SpillSubreg; switch (DstMO.getSubReg()) { default: SpillRC = nullptr; break; case AArch64::sub_32: case AArch64::ssub: if (AArch64::GPR32RegClass.contains(SrcReg)) { SpillRC = &AArch64::GPR64RegClass; SpillSubreg = AArch64::sub_32; } else if (AArch64::FPR32RegClass.contains(SrcReg)) { SpillRC = &AArch64::FPR64RegClass; SpillSubreg = AArch64::ssub; } else SpillRC = nullptr; break; case AArch64::dsub: if (AArch64::FPR64RegClass.contains(SrcReg)) { SpillRC = &AArch64::FPR128RegClass; SpillSubreg = AArch64::dsub; } else SpillRC = nullptr; break; } if (SpillRC) if (unsigned WidenedSrcReg = TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), FrameIndex, SpillRC, &TRI); return &*--InsertPt; } } // Handle cases like filling use of: // // %0:sub_32 = COPY %1; GPR64:%0, GPR32:%1 // // where we can load the full virtual reg source stack slot, into the subreg // destination, in this case producing: // // LDRWui %0:sub_32, %stack.0 // if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { const TargetRegisterClass *FillRC; switch (DstMO.getSubReg()) { default: FillRC = nullptr; break; case AArch64::sub_32: FillRC = &AArch64::GPR32RegClass; break; case AArch64::ssub: FillRC = &AArch64::FPR32RegClass; break; case AArch64::dsub: FillRC = &AArch64::FPR64RegClass; break; } if (FillRC) { assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == TRI.getRegSizeInBits(*FillRC) && "Mismatched regclass size on folded subreg COPY"); loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); MachineInstr &LoadMI = *--InsertPt; MachineOperand &LoadDst = LoadMI.getOperand(0); assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); LoadDst.setSubReg(DstMO.getSubReg()); LoadDst.setIsUndef(); return &LoadMI; } } } // Cannot fold. return nullptr; } int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, bool *OutUseUnscaledOp, unsigned *OutUnscaledOp, int *EmittableOffset) { // Set output values in case of early exit. if (EmittableOffset) *EmittableOffset = 0; if (OutUseUnscaledOp) *OutUseUnscaledOp = false; if (OutUnscaledOp) *OutUnscaledOp = 0; // Exit early for structured vector spills/fills as they can't take an // immediate offset. switch (MI.getOpcode()) { default: break; case AArch64::LD1Twov2d: case AArch64::LD1Threev2d: case AArch64::LD1Fourv2d: case AArch64::LD1Twov1d: case AArch64::LD1Threev1d: case AArch64::LD1Fourv1d: case AArch64::ST1Twov2d: case AArch64::ST1Threev2d: case AArch64::ST1Fourv2d: case AArch64::ST1Twov1d: case AArch64::ST1Threev1d: case AArch64::ST1Fourv1d: case AArch64::IRG: case AArch64::IRGstack: return AArch64FrameOffsetCannotUpdate; } // Get the min/max offset and the scale. unsigned Scale, Width; int64_t MinOff, MaxOff; if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff, MaxOff)) llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); // Construct the complete offset. const MachineOperand &ImmOpnd = MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); Offset += ImmOpnd.getImm() * Scale; // If the offset doesn't match the scale, we rewrite the instruction to // use the unscaled instruction instead. Likewise, if we have a negative // offset and there is an unscaled op to use. Optional UnscaledOp = AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); if (useUnscaledOp && !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff)) llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); int64_t Remainder = Offset % Scale; assert(!(Remainder && useUnscaledOp) && "Cannot have remainder when using unscaled op"); assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); int NewOffset = Offset / Scale; if (MinOff <= NewOffset && NewOffset <= MaxOff) Offset = Remainder; else { NewOffset = NewOffset < 0 ? MinOff : MaxOff; Offset = Offset - NewOffset * Scale + Remainder; } if (EmittableOffset) *EmittableOffset = NewOffset; if (OutUseUnscaledOp) *OutUseUnscaledOp = useUnscaledOp; if (OutUnscaledOp && UnscaledOp) *OutUnscaledOp = *UnscaledOp; return AArch64FrameOffsetCanUpdate | (Offset == 0 ? AArch64FrameOffsetIsLegal : 0); } bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, const AArch64InstrInfo *TII) { unsigned Opcode = MI.getOpcode(); unsigned ImmIdx = FrameRegIdx + 1; if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { Offset += MI.getOperand(ImmIdx).getImm(); emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), MI.getOperand(0).getReg(), FrameReg, Offset, TII, MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); MI.eraseFromParent(); Offset = 0; return true; } int NewOffset; unsigned UnscaledOp; bool UseUnscaledOp; int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, &UnscaledOp, &NewOffset); if (Status & AArch64FrameOffsetCanUpdate) { if (Status & AArch64FrameOffsetIsLegal) // Replace the FrameIndex with FrameReg. MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); if (UseUnscaledOp) MI.setDesc(TII->get(UnscaledOp)); MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); return Offset == 0; } return false; } void AArch64InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } // AArch64 supports MachineCombiner. bool AArch64InstrInfo::useMachineCombiner() const { return true; } // True when Opc sets flag static bool isCombineInstrSettingFlag(unsigned Opc) { switch (Opc) { case AArch64::ADDSWrr: case AArch64::ADDSWri: case AArch64::ADDSXrr: case AArch64::ADDSXri: case AArch64::SUBSWrr: case AArch64::SUBSXrr: // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. case AArch64::SUBSWri: case AArch64::SUBSXri: return true; default: break; } return false; } // 32b Opcodes that can be combined with a MUL static bool isCombineInstrCandidate32(unsigned Opc) { switch (Opc) { case AArch64::ADDWrr: case AArch64::ADDWri: case AArch64::SUBWrr: case AArch64::ADDSWrr: case AArch64::ADDSWri: case AArch64::SUBSWrr: // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. case AArch64::SUBWri: case AArch64::SUBSWri: return true; default: break; } return false; } // 64b Opcodes that can be combined with a MUL static bool isCombineInstrCandidate64(unsigned Opc) { switch (Opc) { case AArch64::ADDXrr: case AArch64::ADDXri: case AArch64::SUBXrr: case AArch64::ADDSXrr: case AArch64::ADDSXri: case AArch64::SUBSXrr: // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. case AArch64::SUBXri: case AArch64::SUBSXri: return true; default: break; } return false; } // FP Opcodes that can be combined with a FMUL static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { switch (Inst.getOpcode()) { default: break; case AArch64::FADDSrr: case AArch64::FADDDrr: case AArch64::FADDv2f32: case AArch64::FADDv2f64: case AArch64::FADDv4f32: case AArch64::FSUBSrr: case AArch64::FSUBDrr: case AArch64::FSUBv2f32: case AArch64::FSUBv2f64: case AArch64::FSUBv4f32: TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; return (Options.UnsafeFPMath || Options.AllowFPOpFusion == FPOpFusion::Fast); } return false; } // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); } // // Utility routine that checks if \param MO is defined by an // \param CombineOpc instruction in the basic block \param MBB static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc, unsigned ZeroReg = 0, bool CheckZeroReg = false) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) return false; // Must only used by the user we combine with. if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; if (CheckZeroReg) { assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); // The third input reg must be zero. if (MI->getOperand(3).getReg() != ZeroReg) return false; } return true; } // // Is \param MO defined by an integer multiply and can be combined? static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg) { return canCombine(MBB, MO, MulOpc, ZeroReg, true); } // // Is \param MO defined by a floating-point multiply and can be combined? static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc) { return canCombine(MBB, MO, MulOpc); } // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) // 3. Other forms of the same operation (intrinsics and other variants) bool AArch64InstrInfo::isAssociativeAndCommutative( const MachineInstr &Inst) const { switch (Inst.getOpcode()) { case AArch64::FADDDrr: case AArch64::FADDSrr: case AArch64::FADDv2f32: case AArch64::FADDv2f64: case AArch64::FADDv4f32: case AArch64::FMULDrr: case AArch64::FMULSrr: case AArch64::FMULX32: case AArch64::FMULX64: case AArch64::FMULXv2f32: case AArch64::FMULXv2f64: case AArch64::FMULXv4f32: case AArch64::FMULv2f32: case AArch64::FMULv2f64: case AArch64::FMULv4f32: return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; default: return false; } } /// Find instructions that can be turned into madd. static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl &Patterns) { unsigned Opc = Root.getOpcode(); MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; if (!isCombineInstrCandidate(Opc)) return false; if (isCombineInstrSettingFlag(Opc)) { int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); // When NZCV is live bail out. if (Cmp_NZCV == -1) return false; unsigned NewOpc = convertToNonFlagSettingOpc(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) return false; Opc = NewOpc; } switch (Opc) { default: break; case AArch64::ADDWrr: assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && "ADDWrr does not have register operands"); if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { Patterns.push_back(MachineCombinerPattern::MULADDW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { Patterns.push_back(MachineCombinerPattern::MULADDW_OP2); Found = true; } break; case AArch64::ADDXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { Patterns.push_back(MachineCombinerPattern::MULADDX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { Patterns.push_back(MachineCombinerPattern::MULADDX_OP2); Found = true; } break; case AArch64::SUBWrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2); Found = true; } break; case AArch64::SUBXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2); Found = true; } break; case AArch64::ADDWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1); Found = true; } break; case AArch64::ADDXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1); Found = true; } break; case AArch64::SUBWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1); Found = true; } break; case AArch64::SUBXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1); Found = true; } break; } return Found; } /// Floating-Point Support /// Find instructions that can be turned into madd. static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl &Patterns) { if (!isCombineInstrCandidateFP(Root)) return false; MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; switch (Root.getOpcode()) { default: assert(false && "Unsupported FP instruction in combiner\n"); break; case AArch64::FADDSrr: assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && "FADDWrr does not have register operands"); if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv1i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv1i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); Found = true; } break; case AArch64::FADDDrr: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv1i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv1i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); Found = true; } break; case AArch64::FADDv2f32: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv2i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv2f32)) { Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2f32)) { Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); Found = true; } break; case AArch64::FADDv2f64: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv2i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv2f64)) { Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2f64)) { Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); Found = true; } break; case AArch64::FADDv4f32: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv4i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv4f32)) { Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv4i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv4f32)) { Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); Found = true; } break; case AArch64::FSUBSrr: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv1i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) { Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1); Found = true; } break; case AArch64::FSUBDrr: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv1i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) { Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1); Found = true; } break; case AArch64::FSUBv2f32: if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2f32)) { Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv2i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv2f32)) { Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1); Found = true; } break; case AArch64::FSUBv2f64: if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2f64)) { Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv2i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv2f64)) { Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1); Found = true; } break; case AArch64::FSUBv4f32: if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv4i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv4f32)) { Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); Found = true; } if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv4i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1); Found = true; } else if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULv4f32)) { Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1); Found = true; } break; } return Found; } /// Return true when a code sequence can improve throughput. It /// should be called only for instructions in loops. /// \param Pattern - combiner pattern bool AArch64InstrInfo::isThroughputPattern( MachineCombinerPattern Pattern) const { switch (Pattern) { default: break; case MachineCombinerPattern::FMULADDS_OP1: case MachineCombinerPattern::FMULADDS_OP2: case MachineCombinerPattern::FMULSUBS_OP1: case MachineCombinerPattern::FMULSUBS_OP2: case MachineCombinerPattern::FMULADDD_OP1: case MachineCombinerPattern::FMULADDD_OP2: case MachineCombinerPattern::FMULSUBD_OP1: case MachineCombinerPattern::FMULSUBD_OP2: case MachineCombinerPattern::FNMULSUBS_OP1: case MachineCombinerPattern::FNMULSUBD_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP2: case MachineCombinerPattern::FMLAv1i64_indexed_OP1: case MachineCombinerPattern::FMLAv1i64_indexed_OP2: case MachineCombinerPattern::FMLAv2f32_OP2: case MachineCombinerPattern::FMLAv2f32_OP1: case MachineCombinerPattern::FMLAv2f64_OP1: case MachineCombinerPattern::FMLAv2f64_OP2: case MachineCombinerPattern::FMLAv2i32_indexed_OP1: case MachineCombinerPattern::FMLAv2i32_indexed_OP2: case MachineCombinerPattern::FMLAv2i64_indexed_OP1: case MachineCombinerPattern::FMLAv2i64_indexed_OP2: case MachineCombinerPattern::FMLAv4f32_OP1: case MachineCombinerPattern::FMLAv4f32_OP2: case MachineCombinerPattern::FMLAv4i32_indexed_OP1: case MachineCombinerPattern::FMLAv4i32_indexed_OP2: case MachineCombinerPattern::FMLSv1i32_indexed_OP2: case MachineCombinerPattern::FMLSv1i64_indexed_OP2: case MachineCombinerPattern::FMLSv2i32_indexed_OP2: case MachineCombinerPattern::FMLSv2i64_indexed_OP2: case MachineCombinerPattern::FMLSv2f32_OP2: case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv4i32_indexed_OP2: case MachineCombinerPattern::FMLSv4f32_OP2: return true; } // end switch (Pattern) return false; } /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector. Pattern should be sorted in priority order since the /// pattern evaluator stops checking as soon as it finds a faster sequence. bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, SmallVectorImpl &Patterns) const { // Integer patterns if (getMaddPatterns(Root, Patterns)) return true; // Floating point patterns if (getFMAPatterns(Root, Patterns)) return true; return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); } enum class FMAInstKind { Default, Indexed, Accumulator }; /// genFusedMultiply - Generate fused multiply instructions. /// This function supports both integer and floating point instructions. /// A typical example: /// F|MUL I=A,B,0 /// F|ADD R,I,C /// ==> F|MADD R,A,B,C /// \param MF Containing MachineFunction /// \param MRI Register information /// \param TII Target information /// \param Root is the F|ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of /// the F|MUL. In the example above IdxMulOpd is 1. /// \param MaddOpc the opcode fo the f|madd instruction /// \param RC Register class of operands /// \param kind of fma instruction (addressing mode) to be generated /// \param ReplacedAddend is the result register from the instruction /// replacing the non-combined operand, if any. static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind = FMAInstKind::Default, const unsigned *ReplacedAddend = nullptr) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); unsigned ResultReg = Root.getOperand(0).getReg(); unsigned SrcReg0 = MUL->getOperand(1).getReg(); bool Src0IsKill = MUL->getOperand(1).isKill(); unsigned SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); unsigned SrcReg2; bool Src2IsKill; if (ReplacedAddend) { // If we just generated a new addend, we must be it's only use. SrcReg2 = *ReplacedAddend; Src2IsKill = true; } else { SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); } if (TargetRegisterInfo::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) MRI.constrainRegClass(SrcReg0, RC); if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) MRI.constrainRegClass(SrcReg1, RC); if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); MachineInstrBuilder MIB; if (kind == FMAInstKind::Default) MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) .addReg(SrcReg0, getKillRegState(Src0IsKill)) .addReg(SrcReg1, getKillRegState(Src1IsKill)) .addReg(SrcReg2, getKillRegState(Src2IsKill)); else if (kind == FMAInstKind::Indexed) MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) .addReg(SrcReg2, getKillRegState(Src2IsKill)) .addReg(SrcReg0, getKillRegState(Src0IsKill)) .addReg(SrcReg1, getKillRegState(Src1IsKill)) .addImm(MUL->getOperand(3).getImm()); else if (kind == FMAInstKind::Accumulator) MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) .addReg(SrcReg2, getKillRegState(Src2IsKill)) .addReg(SrcReg0, getKillRegState(Src0IsKill)) .addReg(SrcReg1, getKillRegState(Src1IsKill)); else assert(false && "Invalid FMA instruction kind \n"); // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) InsInstrs.push_back(MIB); return MUL; } /// genMaddR - Generate madd instruction and combine mul and add using /// an extra virtual register /// Example - an ADD intermediate needs to be stored in a register: /// MUL I=A,B,0 /// ADD R,I,Imm /// ==> ORR V, ZR, Imm /// ==> MADD R,A,B,V /// \param MF Containing MachineFunction /// \param MRI Register information /// \param TII Target information /// \param Root is the ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of /// the MUL. In the example above IdxMulOpd is 1. /// \param MaddOpc the opcode fo the madd instruction /// \param VR is a virtual register that holds the value of an ADD operand /// (V in the example above). /// \param RC Register class of operands static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); unsigned ResultReg = Root.getOperand(0).getReg(); unsigned SrcReg0 = MUL->getOperand(1).getReg(); bool Src0IsKill = MUL->getOperand(1).isKill(); unsigned SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); if (TargetRegisterInfo::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) MRI.constrainRegClass(SrcReg0, RC); if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) MRI.constrainRegClass(SrcReg1, RC); if (TargetRegisterInfo::isVirtualRegister(VR)) MRI.constrainRegClass(VR, RC); MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) .addReg(SrcReg0, getKillRegState(Src0IsKill)) .addReg(SrcReg1, getKillRegState(Src1IsKill)) .addReg(VR); // Insert the MADD InsInstrs.push_back(MIB); return MUL; } /// When getMachineCombinerPatterns() finds potential patterns, /// this function generates the instructions that could replace the /// original code sequence void AArch64InstrInfo::genAlternativeCodeSequence( MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl &InsInstrs, SmallVectorImpl &DelInstrs, DenseMap &InstrIdxForVirtReg) const { MachineBasicBlock &MBB = *Root.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); MachineInstr *MUL; const TargetRegisterClass *RC; unsigned Opc; switch (Pattern) { default: // Reassociate instructions. TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg); return; case MachineCombinerPattern::MULADDW_OP1: case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 // ADD R,I,C // ==> MADD R,A,B,C // --- Create(MADD); if (Pattern == MachineCombinerPattern::MULADDW_OP1) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::MULADDW_OP2: case MachineCombinerPattern::MULADDX_OP2: // MUL I=A,B,0 // ADD R,C,I // ==> MADD R,A,B,C // --- Create(MADD); if (Pattern == MachineCombinerPattern::MULADDW_OP2) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULADDWI_OP1: case MachineCombinerPattern::MULADDXI_OP1: { // MUL I=A,B,0 // ADD R,I,Imm // ==> ORR V, ZR, Imm // ==> MADD R,A,B,V // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned BitSize, OrrOpc, ZeroReg; if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; ZeroReg = AArch64::WZR; Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { OrrOpc = AArch64::ORRXri; OrrRC = &AArch64::GPR64spRegClass; BitSize = 64; ZeroReg = AArch64::XZR; Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } unsigned NewVR = MRI.createVirtualRegister(OrrRC); uint64_t Imm = Root.getOperand(2).getImm(); if (Root.getOperand(3).isImm()) { unsigned Val = Root.getOperand(3).getImm(); Imm = Imm << Val; } uint64_t UImm = SignExtend64(Imm, BitSize); uint64_t Encoding; if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) .addReg(ZeroReg) .addImm(Encoding); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); } break; } case MachineCombinerPattern::MULSUBW_OP1: case MachineCombinerPattern::MULSUBX_OP1: { // MUL I=A,B,0 // SUB R,I, C // ==> SUB V, 0, C // ==> MADD R,A,B,V // = -C + A*B // --- Create(MADD); const TargetRegisterClass *SubRC; unsigned SubOpc, ZeroReg; if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { SubOpc = AArch64::SUBWrr; SubRC = &AArch64::GPR32spRegClass; ZeroReg = AArch64::WZR; Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { SubOpc = AArch64::SUBXrr; SubRC = &AArch64::GPR64spRegClass; ZeroReg = AArch64::XZR; Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } unsigned NewVR = MRI.createVirtualRegister(SubRC); // SUB NewVR, 0, C MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) .addReg(ZeroReg) .add(Root.getOperand(2)); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); break; } case MachineCombinerPattern::MULSUBW_OP2: case MachineCombinerPattern::MULSUBX_OP2: // MUL I=A,B,0 // SUB R,C,I // ==> MSUB R,A,B,C (computes C - A*B) // --- Create(MSUB); if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { Opc = AArch64::MSUBWrrr; RC = &AArch64::GPR32RegClass; } else { Opc = AArch64::MSUBXrrr; RC = &AArch64::GPR64RegClass; } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULSUBWI_OP1: case MachineCombinerPattern::MULSUBXI_OP1: { // MUL I=A,B,0 // SUB R,I, Imm // ==> ORR V, ZR, -Imm // ==> MADD R,A,B,V // = -Imm + A*B // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned BitSize, OrrOpc, ZeroReg; if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; ZeroReg = AArch64::WZR; Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { OrrOpc = AArch64::ORRXri; OrrRC = &AArch64::GPR64spRegClass; BitSize = 64; ZeroReg = AArch64::XZR; Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } unsigned NewVR = MRI.createVirtualRegister(OrrRC); uint64_t Imm = Root.getOperand(2).getImm(); if (Root.getOperand(3).isImm()) { unsigned Val = Root.getOperand(3).getImm(); Imm = Imm << Val; } uint64_t UImm = SignExtend64(-Imm, BitSize); uint64_t Encoding; if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) .addReg(ZeroReg) .addImm(Encoding); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); } break; } // Floating Point Support case MachineCombinerPattern::FMULADDS_OP1: case MachineCombinerPattern::FMULADDD_OP1: // MUL I=A,B,0 // ADD R,I,C // ==> MADD R,A,B,C // --- Create(MADD); if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { Opc = AArch64::FMADDSrrr; RC = &AArch64::FPR32RegClass; } else { Opc = AArch64::FMADDDrrr; RC = &AArch64::FPR64RegClass; } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::FMULADDS_OP2: case MachineCombinerPattern::FMULADDD_OP2: // FMUL I=A,B,0 // FADD R,C,I // ==> FMADD R,A,B,C // --- Create(FMADD); if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { Opc = AArch64::FMADDSrrr; RC = &AArch64::FPR32RegClass; } else { Opc = AArch64::FMADDDrrr; RC = &AArch64::FPR64RegClass; } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::FMLAv1i32_indexed_OP1: Opc = AArch64::FMLAv1i32_indexed; RC = &AArch64::FPR32RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Indexed); break; case MachineCombinerPattern::FMLAv1i32_indexed_OP2: Opc = AArch64::FMLAv1i32_indexed; RC = &AArch64::FPR32RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); break; case MachineCombinerPattern::FMLAv1i64_indexed_OP1: Opc = AArch64::FMLAv1i64_indexed; RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Indexed); break; case MachineCombinerPattern::FMLAv1i64_indexed_OP2: Opc = AArch64::FMLAv1i64_indexed; RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); break; case MachineCombinerPattern::FMLAv2i32_indexed_OP1: case MachineCombinerPattern::FMLAv2f32_OP1: RC = &AArch64::FPR64RegClass; if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { Opc = AArch64::FMLAv2i32_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLAv2f32; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMLAv2i32_indexed_OP2: case MachineCombinerPattern::FMLAv2f32_OP2: RC = &AArch64::FPR64RegClass; if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { Opc = AArch64::FMLAv2i32_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLAv2f32; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMLAv2i64_indexed_OP1: case MachineCombinerPattern::FMLAv2f64_OP1: RC = &AArch64::FPR128RegClass; if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { Opc = AArch64::FMLAv2i64_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLAv2f64; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMLAv2i64_indexed_OP2: case MachineCombinerPattern::FMLAv2f64_OP2: RC = &AArch64::FPR128RegClass; if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { Opc = AArch64::FMLAv2i64_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLAv2f64; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMLAv4i32_indexed_OP1: case MachineCombinerPattern::FMLAv4f32_OP1: RC = &AArch64::FPR128RegClass; if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { Opc = AArch64::FMLAv4i32_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLAv4f32; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMLAv4i32_indexed_OP2: case MachineCombinerPattern::FMLAv4f32_OP2: RC = &AArch64::FPR128RegClass; if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { Opc = AArch64::FMLAv4i32_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLAv4f32; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMULSUBS_OP1: case MachineCombinerPattern::FMULSUBD_OP1: { // FMUL I=A,B,0 // FSUB R,I,C // ==> FNMSUB R,A,B,C // = -C + A*B // --- Create(FNMSUB); if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { Opc = AArch64::FNMSUBSrrr; RC = &AArch64::FPR32RegClass; } else { Opc = AArch64::FNMSUBDrrr; RC = &AArch64::FPR64RegClass; } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; } case MachineCombinerPattern::FNMULSUBS_OP1: case MachineCombinerPattern::FNMULSUBD_OP1: { // FNMUL I=A,B,0 // FSUB R,I,C // ==> FNMADD R,A,B,C // = -A*B - C // --- Create(FNMADD); if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) { Opc = AArch64::FNMADDSrrr; RC = &AArch64::FPR32RegClass; } else { Opc = AArch64::FNMADDDrrr; RC = &AArch64::FPR64RegClass; } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; } case MachineCombinerPattern::FMULSUBS_OP2: case MachineCombinerPattern::FMULSUBD_OP2: { // FMUL I=A,B,0 // FSUB R,C,I // ==> FMSUB R,A,B,C (computes C - A*B) // --- Create(FMSUB); if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { Opc = AArch64::FMSUBSrrr; RC = &AArch64::FPR32RegClass; } else { Opc = AArch64::FMSUBDrrr; RC = &AArch64::FPR64RegClass; } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; } case MachineCombinerPattern::FMLSv1i32_indexed_OP2: Opc = AArch64::FMLSv1i32_indexed; RC = &AArch64::FPR32RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); break; case MachineCombinerPattern::FMLSv1i64_indexed_OP2: Opc = AArch64::FMLSv1i64_indexed; RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); break; case MachineCombinerPattern::FMLSv2f32_OP2: case MachineCombinerPattern::FMLSv2i32_indexed_OP2: RC = &AArch64::FPR64RegClass; if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { Opc = AArch64::FMLSv2i32_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLSv2f32; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv2i64_indexed_OP2: RC = &AArch64::FPR128RegClass; if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { Opc = AArch64::FMLSv2i64_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLSv2f64; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMLSv4f32_OP2: case MachineCombinerPattern::FMLSv4i32_indexed_OP2: RC = &AArch64::FPR128RegClass; if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { Opc = AArch64::FMLSv4i32_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Indexed); } else { Opc = AArch64::FMLSv4f32; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, FMAInstKind::Accumulator); } break; case MachineCombinerPattern::FMLSv2f32_OP1: case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { RC = &AArch64::FPR64RegClass; unsigned NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) .add(Root.getOperand(2)); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { Opc = AArch64::FMLAv2i32_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Indexed, &NewVR); } else { Opc = AArch64::FMLAv2f32; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Accumulator, &NewVR); } break; } case MachineCombinerPattern::FMLSv4f32_OP1: case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { RC = &AArch64::FPR128RegClass; unsigned NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) .add(Root.getOperand(2)); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { Opc = AArch64::FMLAv4i32_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Indexed, &NewVR); } else { Opc = AArch64::FMLAv4f32; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Accumulator, &NewVR); } break; } case MachineCombinerPattern::FMLSv2f64_OP1: case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { RC = &AArch64::FPR128RegClass; unsigned NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) .add(Root.getOperand(2)); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { Opc = AArch64::FMLAv2i64_indexed; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Indexed, &NewVR); } else { Opc = AArch64::FMLAv2f64; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, FMAInstKind::Accumulator, &NewVR); } break; } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); DelInstrs.push_back(&Root); } /// Replace csincr-branch sequence by simple conditional branch /// /// Examples: /// 1. \code /// csinc w9, wzr, wzr, /// tbnz w9, #0, 0x44 /// \endcode /// to /// \code /// b. /// \endcode /// /// 2. \code /// csinc w9, wzr, wzr, /// tbz w9, #0, 0x44 /// \endcode /// to /// \code /// b. /// \endcode /// /// Replace compare and branch sequence by TBZ/TBNZ instruction when the /// compare's constant operand is power of 2. /// /// Examples: /// \code /// and w8, w8, #0x400 /// cbnz w8, L1 /// \endcode /// to /// \code /// tbnz w8, #10, L1 /// \endcode /// /// \param MI Conditional Branch /// \return True when the simple conditional branch is generated /// bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { bool IsNegativeBranch = false; bool IsTestAndBranch = false; unsigned TargetBBInMI = 0; switch (MI.getOpcode()) { default: llvm_unreachable("Unknown branch instruction?"); case AArch64::Bcc: return false; case AArch64::CBZW: case AArch64::CBZX: TargetBBInMI = 1; break; case AArch64::CBNZW: case AArch64::CBNZX: TargetBBInMI = 1; IsNegativeBranch = true; break; case AArch64::TBZW: case AArch64::TBZX: TargetBBInMI = 2; IsTestAndBranch = true; break; case AArch64::TBNZW: case AArch64::TBNZX: TargetBBInMI = 2; IsNegativeBranch = true; IsTestAndBranch = true; break; } // So we increment a zero register and test for bits other // than bit 0? Conservatively bail out in case the verifier // missed this case. if (IsTestAndBranch && MI.getOperand(1).getImm()) return false; // Find Definition. assert(MI.getParent() && "Incomplete machine instruciton\n"); MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); unsigned VReg = MI.getOperand(0).getReg(); if (!TargetRegisterInfo::isVirtualRegister(VReg)) return false; MachineInstr *DefMI = MRI->getVRegDef(VReg); // Look through COPY instructions to find definition. while (DefMI->isCopy()) { unsigned CopyVReg = DefMI->getOperand(1).getReg(); if (!MRI->hasOneNonDBGUse(CopyVReg)) return false; if (!MRI->hasOneDef(CopyVReg)) return false; DefMI = MRI->getVRegDef(CopyVReg); } switch (DefMI->getOpcode()) { default: return false; // Fold AND into a TBZ/TBNZ if constant operand is power of 2. case AArch64::ANDWri: case AArch64::ANDXri: { if (IsTestAndBranch) return false; if (DefMI->getParent() != MBB) return false; if (!MRI->hasOneNonDBGUse(VReg)) return false; bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); uint64_t Mask = AArch64_AM::decodeLogicalImmediate( DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64); if (!isPowerOf2_64(Mask)) return false; MachineOperand &MO = DefMI->getOperand(1); unsigned NewReg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(NewReg)) return false; assert(!MRI->def_empty(NewReg) && "Register must be defined."); MachineBasicBlock &RefToMBB = *MBB; MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); DebugLoc DL = MI.getDebugLoc(); unsigned Imm = Log2_64(Mask); unsigned Opc = (Imm < 32) ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) .addReg(NewReg) .addImm(Imm) .addMBB(TBB); // Register lives on to the CBZ now. MO.setIsKill(false); // For immediate smaller than 32, we need to use the 32-bit // variant (W) in all cases. Indeed the 64-bit variant does not // allow to encode them. // Therefore, if the input register is 64-bit, we need to take the // 32-bit sub-part. if (!Is32Bit && Imm < 32) NewMI->getOperand(0).setSubReg(AArch64::sub_32); MI.eraseFromParent(); return true; } // Look for CSINC case AArch64::CSINCWr: case AArch64::CSINCXr: { if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && DefMI->getOperand(2).getReg() == AArch64::WZR) && !(DefMI->getOperand(1).getReg() == AArch64::XZR && DefMI->getOperand(2).getReg() == AArch64::XZR)) return false; if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) return false; AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); // Convert only when the condition code is not modified between // the CSINC and the branch. The CC may be used by other // instructions in between. if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) return false; MachineBasicBlock &RefToMBB = *MBB; MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); DebugLoc DL = MI.getDebugLoc(); if (IsNegativeBranch) CC = AArch64CC::getInvertedCondCode(CC); BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); MI.eraseFromParent(); return true; } } } std::pair AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { const unsigned Mask = AArch64II::MO_FRAGMENT; return std::make_pair(TF & Mask, TF & ~Mask); } ArrayRef> AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { using namespace AArch64II; static const std::pair TargetFlags[] = { {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, {MO_HI12, "aarch64-hi12"}}; return makeArrayRef(TargetFlags); } ArrayRef> AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { using namespace AArch64II; static const std::pair TargetFlags[] = { {MO_COFFSTUB, "aarch64-coffstub"}, {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"}, {MO_DLLIMPORT, "aarch64-dllimport"}}; return makeArrayRef(TargetFlags); } ArrayRef> AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { static const std::pair TargetFlags[] = {{MOSuppressPair, "aarch64-suppress-pair"}, {MOStridedAccess, "aarch64-strided-access"}}; return makeArrayRef(TargetFlags); } /// Constants defining how certain sequences should be outlined. /// This encompasses how an outlined function should be called, and what kind of /// frame should be emitted for that outlined function. /// /// \p MachineOutlinerDefault implies that the function should be called with /// a save and restore of LR to the stack. /// /// That is, /// /// I1 Save LR OUTLINED_FUNCTION: /// I2 --> BL OUTLINED_FUNCTION I1 /// I3 Restore LR I2 /// I3 /// RET /// /// * Call construction overhead: 3 (save + BL + restore) /// * Frame construction overhead: 1 (ret) /// * Requires stack fixups? Yes /// /// \p MachineOutlinerTailCall implies that the function is being created from /// a sequence of instructions ending in a return. /// /// That is, /// /// I1 OUTLINED_FUNCTION: /// I2 --> B OUTLINED_FUNCTION I1 /// RET I2 /// RET /// /// * Call construction overhead: 1 (B) /// * Frame construction overhead: 0 (Return included in sequence) /// * Requires stack fixups? No /// /// \p MachineOutlinerNoLRSave implies that the function should be called using /// a BL instruction, but doesn't require LR to be saved and restored. This /// happens when LR is known to be dead. /// /// That is, /// /// I1 OUTLINED_FUNCTION: /// I2 --> BL OUTLINED_FUNCTION I1 /// I3 I2 /// I3 /// RET /// /// * Call construction overhead: 1 (BL) /// * Frame construction overhead: 1 (RET) /// * Requires stack fixups? No /// /// \p MachineOutlinerThunk implies that the function is being created from /// a sequence of instructions ending in a call. The outlined function is /// called with a BL instruction, and the outlined function tail-calls the /// original call destination. /// /// That is, /// /// I1 OUTLINED_FUNCTION: /// I2 --> BL OUTLINED_FUNCTION I1 /// BL f I2 /// B f /// * Call construction overhead: 1 (BL) /// * Frame construction overhead: 0 /// * Requires stack fixups? No /// /// \p MachineOutlinerRegSave implies that the function should be called with a /// save and restore of LR to an available register. This allows us to avoid /// stack fixups. Note that this outlining variant is compatible with the /// NoLRSave case. /// /// That is, /// /// I1 Save LR OUTLINED_FUNCTION: /// I2 --> BL OUTLINED_FUNCTION I1 /// I3 Restore LR I2 /// I3 /// RET /// /// * Call construction overhead: 3 (save + BL + restore) /// * Frame construction overhead: 1 (ret) /// * Requires stack fixups? No enum MachineOutlinerClass { MachineOutlinerDefault, /// Emit a save, restore, call, and return. MachineOutlinerTailCall, /// Only emit a branch. MachineOutlinerNoLRSave, /// Emit a call and return. MachineOutlinerThunk, /// Emit a call and tail-call. MachineOutlinerRegSave /// Same as default, but save to a register. }; enum MachineOutlinerMBBFlags { LRUnavailableSomewhere = 0x2, HasCalls = 0x4, UnsafeRegsDead = 0x8 }; unsigned AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { assert(C.LRUWasSet && "LRU wasn't set?"); MachineFunction *MF = C.getMF(); const AArch64RegisterInfo *ARI = static_cast( MF->getSubtarget().getRegisterInfo()); // Check if there is an available register across the sequence that we can // use. for (unsigned Reg : AArch64::GPR64RegClass) { if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && // LR is not reserved, but don't use it. Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. Reg != AArch64::X17 && // Ditto for X17. C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) return Reg; } // No suitable register. Return 0. return 0u; } outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( std::vector &RepeatedSequenceLocs) const { outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; unsigned SequenceSize = std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, [this](unsigned Sum, const MachineInstr &MI) { return Sum + getInstSizeInBytes(MI); }); // Properties about candidate MBBs that hold for all of them. unsigned FlagsSetInAll = 0xF; // Compute liveness information for each candidate, and set FlagsSetInAll. const TargetRegisterInfo &TRI = getRegisterInfo(); std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), [&FlagsSetInAll](outliner::Candidate &C) { FlagsSetInAll &= C.Flags; }); // According to the AArch64 Procedure Call Standard, the following are // undefined on entry/exit from a function call: // // * Registers x16, x17, (and thus w16, w17) // * Condition codes (and thus the NZCV register) // // Because if this, we can't outline any sequence of instructions where // one // of these registers is live into/across it. Thus, we need to delete // those // candidates. auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { // If the unsafe registers in this block are all dead, then we don't need // to compute liveness here. if (C.Flags & UnsafeRegsDead) return false; C.initLRU(TRI); LiveRegUnits LRU = C.LRU; return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) || !LRU.available(AArch64::NZCV)); }; // Are there any candidates where those registers are live? if (!(FlagsSetInAll & UnsafeRegsDead)) { // Erase every candidate that violates the restrictions above. (It could be // true that we have viable candidates, so it's not worth bailing out in // the case that, say, 1 out of 20 candidates violate the restructions.) RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), CantGuaranteeValueAcrossCall), RepeatedSequenceLocs.end()); // If the sequence doesn't have enough candidates left, then we're done. if (RepeatedSequenceLocs.size() < 2) return outliner::OutlinedFunction(); } // At this point, we have only "safe" candidates to outline. Figure out // frame + call instruction information. unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); // Helper lambda which sets call information for every candidate. auto SetCandidateCallInfo = [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { for (outliner::Candidate &C : RepeatedSequenceLocs) C.setCallInfo(CallID, NumBytesForCall); }; unsigned FrameID = MachineOutlinerDefault; unsigned NumBytesToCreateFrame = 4; bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement"); }); // Returns true if an instructions is safe to fix up, false otherwise. auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { if (MI.isCall()) return true; if (!MI.modifiesRegister(AArch64::SP, &TRI) && !MI.readsRegister(AArch64::SP, &TRI)) return true; // Any modification of SP will break our code to save/restore LR. // FIXME: We could handle some instructions which add a constant // offset to SP, with a bit more work. if (MI.modifiesRegister(AArch64::SP, &TRI)) return false; // At this point, we have a stack instruction that we might need to // fix up. We'll handle it if it's a load or store. if (MI.mayLoadOrStore()) { const MachineOperand *Base; // Filled with the base operand of MI. int64_t Offset; // Filled with the offset of MI. // Does it allow us to offset the base operand and is the base the // register SP? if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() || Base->getReg() != AArch64::SP) return false; // Find the minimum/maximum offset for this instruction and check // if fixing it up would be in range. int64_t MinOffset, MaxOffset; // Unscaled offsets for the instruction. unsigned Scale; // The scale to multiply the offsets by. unsigned DummyWidth; getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); Offset += 16; // Update the offset to what it would be if we outlined. if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale) return false; // It's in range, so we can outline it. return true; } // FIXME: Add handling for instructions like "add x0, sp, #8". // We can't fix it up, so don't outline it. return false; }; // True if it's possible to fix up each stack instruction in this sequence. // Important for frames/call variants that modify the stack. bool AllStackInstrsSafe = std::all_of( FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); // If the last instruction in any candidate is a terminator, then we should // tail call all of the candidates. if (RepeatedSequenceLocs[0].back()->isTerminator()) { FrameID = MachineOutlinerTailCall; NumBytesToCreateFrame = 0; SetCandidateCallInfo(MachineOutlinerTailCall, 4); } else if (LastInstrOpcode == AArch64::BL || (LastInstrOpcode == AArch64::BLR && !HasBTI)) { // FIXME: Do we need to check if the code after this uses the value of LR? FrameID = MachineOutlinerThunk; NumBytesToCreateFrame = 0; SetCandidateCallInfo(MachineOutlinerThunk, 4); } else { // We need to decide how to emit calls + frames. We can always emit the same // frame if we don't need to save to the stack. If we have to save to the // stack, then we need a different frame. unsigned NumBytesNoStackCalls = 0; std::vector CandidatesWithoutStackFixups; for (outliner::Candidate &C : RepeatedSequenceLocs) { C.initLRU(TRI); // Is LR available? If so, we don't need a save. if (C.LRU.available(AArch64::LR)) { NumBytesNoStackCalls += 4; C.setCallInfo(MachineOutlinerNoLRSave, 4); CandidatesWithoutStackFixups.push_back(C); } // Is an unused register available? If so, we won't modify the stack, so // we can outline with the same frame type as those that don't save LR. else if (findRegisterToSaveLRTo(C)) { NumBytesNoStackCalls += 12; C.setCallInfo(MachineOutlinerRegSave, 12); CandidatesWithoutStackFixups.push_back(C); } // Is SP used in the sequence at all? If not, we don't have to modify // the stack, so we are guaranteed to get the same frame. else if (C.UsedInSequence.available(AArch64::SP)) { NumBytesNoStackCalls += 12; C.setCallInfo(MachineOutlinerDefault, 12); CandidatesWithoutStackFixups.push_back(C); } // If we outline this, we need to modify the stack. Pretend we don't // outline this by saving all of its bytes. else { NumBytesNoStackCalls += SequenceSize; } } // If there are no places where we have to save LR, then note that we // don't have to update the stack. Otherwise, give every candidate the // default call type, as long as it's safe to do so. if (!AllStackInstrsSafe || NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { RepeatedSequenceLocs = CandidatesWithoutStackFixups; FrameID = MachineOutlinerNoLRSave; } else { SetCandidateCallInfo(MachineOutlinerDefault, 12); } // If we dropped all of the candidates, bail out here. if (RepeatedSequenceLocs.size() < 2) { RepeatedSequenceLocs.clear(); return outliner::OutlinedFunction(); } } // Does every candidate's MBB contain a call? If so, then we might have a call // in the range. if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { // Check if the range contains a call. These require a save + restore of the // link register. bool ModStackToSaveLR = false; if (std::any_of(FirstCand.front(), FirstCand.back(), [](const MachineInstr &MI) { return MI.isCall(); })) ModStackToSaveLR = true; // Handle the last instruction separately. If this is a tail call, then the // last instruction is a call. We don't want to save + restore in this case. // However, it could be possible that the last instruction is a call without // it being valid to tail call this sequence. We should consider this as // well. else if (FrameID != MachineOutlinerThunk && FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) ModStackToSaveLR = true; if (ModStackToSaveLR) { // We can't fix up the stack. Bail out. if (!AllStackInstrsSafe) { RepeatedSequenceLocs.clear(); return outliner::OutlinedFunction(); } // Save + restore LR. NumBytesToCreateFrame += 8; } } return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID); } bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { const Function &F = MF.getFunction(); // Can F be deduplicated by the linker? If it can, don't outline from it. if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) return false; // Don't outline from functions with section markings; the program could // expect that all the code is in the named section. // FIXME: Allow outlining from multiple functions with the same section // marking. if (F.hasSection()) return false; // Outlining from functions with redzones is unsafe since the outliner may // modify the stack. Check if hasRedZone is true or unknown; if yes, don't // outline from it. AArch64FunctionInfo *AFI = MF.getInfo(); if (!AFI || AFI->hasRedZone().getValueOr(true)) return false; // It's safe to outline from MF. return true; } bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, unsigned &Flags) const { // Check if LR is available through all of the MBB. If it's not, then set // a flag. assert(MBB.getParent()->getRegInfo().tracksLiveness() && "Suitable Machine Function for outlining must track liveness"); LiveRegUnits LRU(getRegisterInfo()); std::for_each(MBB.rbegin(), MBB.rend(), [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); // Check if each of the unsafe registers are available... bool W16AvailableInBlock = LRU.available(AArch64::W16); bool W17AvailableInBlock = LRU.available(AArch64::W17); bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); // If all of these are dead (and not live out), we know we don't have to check // them later. if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; // Now, add the live outs to the set. LRU.addLiveOuts(MBB); // If any of these registers is available in the MBB, but also a live out of // the block, then we know outlining is unsafe. if (W16AvailableInBlock && !LRU.available(AArch64::W16)) return false; if (W17AvailableInBlock && !LRU.available(AArch64::W17)) return false; if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV)) return false; // Check if there's a call inside this MachineBasicBlock. If there is, then // set a flag. if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) Flags |= MachineOutlinerMBBFlags::HasCalls; MachineFunction *MF = MBB.getParent(); // In the event that we outline, we may have to save LR. If there is an // available register in the MBB, then we'll always save LR there. Check if // this is true. bool CanSaveLR = false; const AArch64RegisterInfo *ARI = static_cast( MF->getSubtarget().getRegisterInfo()); // Check if there is an available register across the sequence that we can // use. for (unsigned Reg : AArch64::GPR64RegClass) { if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) { CanSaveLR = true; break; } } // Check if we have a register we can save LR to, and if LR was used // somewhere. If both of those things are true, then we need to evaluate the // safety of outlining stack instructions later. if (!CanSaveLR && !LRU.available(AArch64::LR)) Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; return true; } outliner::InstrType AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const { MachineInstr &MI = *MIT; MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); AArch64FunctionInfo *FuncInfo = MF->getInfo(); // Don't outline LOHs. if (FuncInfo->getLOHRelated().count(&MI)) return outliner::InstrType::Illegal; // Don't allow debug values to impact outlining type. if (MI.isDebugInstr() || MI.isIndirectDebugValue()) return outliner::InstrType::Invisible; // At this point, KILL instructions don't really tell us much so we can go // ahead and skip over them. if (MI.isKill()) return outliner::InstrType::Invisible; // Is this a terminator for a basic block? if (MI.isTerminator()) { // Is this the end of a function? if (MI.getParent()->succ_empty()) return outliner::InstrType::Legal; // It's not, so don't outline it. return outliner::InstrType::Illegal; } // Make sure none of the operands are un-outlinable. for (const MachineOperand &MOP : MI.operands()) { if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || MOP.isTargetIndex()) return outliner::InstrType::Illegal; // If it uses LR or W30 explicitly, then don't touch it. if (MOP.isReg() && !MOP.isImplicit() && (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30)) return outliner::InstrType::Illegal; } // Special cases for instructions that can always be outlined, but will fail // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always // be outlined because they don't require a *specific* value to be in LR. if (MI.getOpcode() == AArch64::ADRP) return outliner::InstrType::Legal; // If MI is a call we might be able to outline it. We don't want to outline // any calls that rely on the position of items on the stack. When we outline // something containing a call, we have to emit a save and restore of LR in // the outlined function. Currently, this always happens by saving LR to the // stack. Thus, if we outline, say, half the parameters for a function call // plus the call, then we'll break the callee's expectations for the layout // of the stack. // // FIXME: Allow calls to functions which construct a stack frame, as long // as they don't access arguments on the stack. // FIXME: Figure out some way to analyze functions defined in other modules. // We should be able to compute the memory usage based on the IR calling // convention, even if we can't see the definition. if (MI.isCall()) { // Get the function associated with the call. Look at each operand and find // the one that represents the callee and get its name. const Function *Callee = nullptr; for (const MachineOperand &MOP : MI.operands()) { if (MOP.isGlobal()) { Callee = dyn_cast(MOP.getGlobal()); break; } } // Never outline calls to mcount. There isn't any rule that would require // this, but the Linux kernel's "ftrace" feature depends on it. if (Callee && Callee->getName() == "\01_mcount") return outliner::InstrType::Illegal; // If we don't know anything about the callee, assume it depends on the // stack layout of the caller. In that case, it's only legal to outline // as a tail-call. Whitelist the call instructions we know about so we // don't get unexpected results with call pseudo-instructions. auto UnknownCallOutlineType = outliner::InstrType::Illegal; if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL) UnknownCallOutlineType = outliner::InstrType::LegalTerminator; if (!Callee) return UnknownCallOutlineType; // We have a function we have information about. Check it if it's something // can safely outline. MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); // We don't know what's going on with the callee at all. Don't touch it. if (!CalleeMF) return UnknownCallOutlineType; // Check if we know anything about the callee saves on the function. If we // don't, then don't touch it, since that implies that we haven't // computed anything about its stack frame yet. MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || MFI.getNumObjects() > 0) return UnknownCallOutlineType; // At this point, we can say that CalleeMF ought to not pass anything on the // stack. Therefore, we can outline it. return outliner::InstrType::Legal; } // Don't outline positions. if (MI.isPosition()) return outliner::InstrType::Illegal; // Don't touch the link register or W30. if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) return outliner::InstrType::Illegal; // Don't outline BTI instructions, because that will prevent the outlining // site from being indirectly callable. if (MI.getOpcode() == AArch64::HINT) { int64_t Imm = MI.getOperand(0).getImm(); if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) return outliner::InstrType::Illegal; } return outliner::InstrType::Legal; } void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { for (MachineInstr &MI : MBB) { const MachineOperand *Base; unsigned Width; int64_t Offset; // Is this a load or store with an immediate offset with SP as the base? if (!MI.mayLoadOrStore() || !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) || (Base->isReg() && Base->getReg() != AArch64::SP)) continue; // It is, so we have to fix it up. unsigned Scale; int64_t Dummy1, Dummy2; MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); assert(Scale != 0 && "Unexpected opcode!"); // We've pushed the return address to the stack, so add 16 to the offset. // This is safe, since we already checked if it would overflow when we // checked if this instruction was legal to outline. int64_t NewImm = (Offset + 16) / Scale; StackOffsetOperand.setImm(NewImm); } } void AArch64InstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { // For thunk outlining, rewrite the last instruction from a call to a // tail-call. if (OF.FrameConstructionID == MachineOutlinerThunk) { MachineInstr *Call = &*--MBB.instr_end(); unsigned TailOpcode; if (Call->getOpcode() == AArch64::BL) { TailOpcode = AArch64::TCRETURNdi; } else { assert(Call->getOpcode() == AArch64::BLR); TailOpcode = AArch64::TCRETURNriALL; } MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) .add(Call->getOperand(0)) .addImm(0); MBB.insert(MBB.end(), TC); Call->eraseFromParent(); } // Is there a call in the outlined range? auto IsNonTailCall = [](MachineInstr &MI) { return MI.isCall() && !MI.isReturn(); }; if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) { // Fix up the instructions in the range, since we're going to modify the // stack. assert(OF.FrameConstructionID != MachineOutlinerDefault && "Can only fix up stack references once"); fixupPostOutline(MBB); // LR has to be a live in so that we can save it. MBB.addLiveIn(AArch64::LR); MachineBasicBlock::iterator It = MBB.begin(); MachineBasicBlock::iterator Et = MBB.end(); if (OF.FrameConstructionID == MachineOutlinerTailCall || OF.FrameConstructionID == MachineOutlinerThunk) Et = std::prev(MBB.end()); // Insert a save before the outlined region MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) .addReg(AArch64::SP, RegState::Define) .addReg(AArch64::LR) .addReg(AArch64::SP) .addImm(-16); It = MBB.insert(It, STRXpre); const TargetSubtargetInfo &STI = MF.getSubtarget(); const MCRegisterInfo *MRI = STI.getRegisterInfo(); unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); // Add a CFI saying the stack was moved 16 B down. int64_t StackPosEntry = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16)); BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) .addCFIIndex(StackPosEntry) .setMIFlags(MachineInstr::FrameSetup); // Add a CFI saying that the LR that we want to find is now 16 B higher than // before. int64_t LRPosEntry = MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16)); BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) .addCFIIndex(LRPosEntry) .setMIFlags(MachineInstr::FrameSetup); // Insert a restore before the terminator for the function. MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) .addReg(AArch64::SP, RegState::Define) .addReg(AArch64::LR, RegState::Define) .addReg(AArch64::SP) .addImm(16); Et = MBB.insert(Et, LDRXpost); } // If this is a tail call outlined function, then there's already a return. if (OF.FrameConstructionID == MachineOutlinerTailCall || OF.FrameConstructionID == MachineOutlinerThunk) return; // It's not a tail call, so we have to insert the return ourselves. MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) .addReg(AArch64::LR, RegState::Undef); MBB.insert(MBB.end(), ret); // Did we have to modify the stack by saving the link register? if (OF.FrameConstructionID != MachineOutlinerDefault) return; // We modified the stack. // Walk over the basic block and fix up all the stack accesses. fixupPostOutline(MBB); } MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const { // Are we tail calling? if (C.CallConstructionID == MachineOutlinerTailCall) { // If yes, then we can just branch to the label. It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) .addGlobalAddress(M.getNamedValue(MF.getName())) .addImm(0)); return It; } // Are we saving the link register? if (C.CallConstructionID == MachineOutlinerNoLRSave || C.CallConstructionID == MachineOutlinerThunk) { // No, so just insert the call. It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) .addGlobalAddress(M.getNamedValue(MF.getName()))); return It; } // We want to return the spot where we inserted the call. MachineBasicBlock::iterator CallPt; // Instructions for saving and restoring LR around the call instruction we're // going to insert. MachineInstr *Save; MachineInstr *Restore; // Can we save to a register? if (C.CallConstructionID == MachineOutlinerRegSave) { // FIXME: This logic should be sunk into a target-specific interface so that // we don't have to recompute the register. unsigned Reg = findRegisterToSaveLRTo(C); assert(Reg != 0 && "No callee-saved register available?"); // Save and restore LR from that register. Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) .addReg(AArch64::XZR) .addReg(AArch64::LR) .addImm(0); Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) .addReg(AArch64::XZR) .addReg(Reg) .addImm(0); } else { // We have the default case. Save and restore from SP. Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) .addReg(AArch64::SP, RegState::Define) .addReg(AArch64::LR) .addReg(AArch64::SP) .addImm(-16); Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) .addReg(AArch64::SP, RegState::Define) .addReg(AArch64::LR, RegState::Define) .addReg(AArch64::SP) .addImm(16); } It = MBB.insert(It, Save); It++; // Insert the call. It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) .addGlobalAddress(M.getNamedValue(MF.getName()))); CallPt = It; It++; It = MBB.insert(It, Restore); return CallPt; } bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( MachineFunction &MF) const { return MF.getFunction().hasMinSize(); } bool AArch64InstrInfo::isCopyInstrImpl( const MachineInstr &MI, const MachineOperand *&Source, const MachineOperand *&Destination) const { // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg // and zero immediate operands used as an alias for mov instruction. if (MI.getOpcode() == AArch64::ORRWrs && MI.getOperand(1).getReg() == AArch64::WZR && MI.getOperand(3).getImm() == 0x0) { Destination = &MI.getOperand(0); Source = &MI.getOperand(2); return true; } if (MI.getOpcode() == AArch64::ORRXrs && MI.getOperand(1).getReg() == AArch64::XZR && MI.getOperand(3).getImm() == 0x0) { Destination = &MI.getOperand(0); Source = &MI.getOperand(2); return true; } return false; } #define GET_INSTRINFO_HELPERS #include "AArch64GenInstrInfo.inc" Index: vendor/llvm/dist-release_90/lib/Target/Mips/AsmParser/MipsAsmParser.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Target/Mips/AsmParser/MipsAsmParser.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/Mips/AsmParser/MipsAsmParser.cpp (revision 351709) @@ -1,8467 +1,8484 @@ //===-- MipsAsmParser.cpp - Parse Mips assembly to MCInst instructions ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "MCTargetDesc/MipsABIFlagsSection.h" #include "MCTargetDesc/MipsABIInfo.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "MCTargetDesc/MipsMCExpr.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "MipsTargetStreamer.h" #include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCParser/MCAsmParserUtils.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "mips-asm-parser" namespace llvm { class MCInstrInfo; } // end namespace llvm extern cl::opt EmitJalrReloc; namespace { class MipsAssemblerOptions { public: MipsAssemblerOptions(const FeatureBitset &Features_) : Features(Features_) {} MipsAssemblerOptions(const MipsAssemblerOptions *Opts) { ATReg = Opts->getATRegIndex(); Reorder = Opts->isReorder(); Macro = Opts->isMacro(); Features = Opts->getFeatures(); } unsigned getATRegIndex() const { return ATReg; } bool setATRegIndex(unsigned Reg) { if (Reg > 31) return false; ATReg = Reg; return true; } bool isReorder() const { return Reorder; } void setReorder() { Reorder = true; } void setNoReorder() { Reorder = false; } bool isMacro() const { return Macro; } void setMacro() { Macro = true; } void setNoMacro() { Macro = false; } const FeatureBitset &getFeatures() const { return Features; } void setFeatures(const FeatureBitset &Features_) { Features = Features_; } // Set of features that are either architecture features or referenced // by them (e.g.: FeatureNaN2008 implied by FeatureMips32r6). // The full table can be found in MipsGenSubtargetInfo.inc (MipsFeatureKV[]). // The reason we need this mask is explained in the selectArch function. // FIXME: Ideally we would like TableGen to generate this information. static const FeatureBitset AllArchRelatedMask; private: unsigned ATReg = 1; bool Reorder = true; bool Macro = true; FeatureBitset Features; }; } // end anonymous namespace const FeatureBitset MipsAssemblerOptions::AllArchRelatedMask = { Mips::FeatureMips1, Mips::FeatureMips2, Mips::FeatureMips3, Mips::FeatureMips3_32, Mips::FeatureMips3_32r2, Mips::FeatureMips4, Mips::FeatureMips4_32, Mips::FeatureMips4_32r2, Mips::FeatureMips5, Mips::FeatureMips5_32r2, Mips::FeatureMips32, Mips::FeatureMips32r2, Mips::FeatureMips32r3, Mips::FeatureMips32r5, Mips::FeatureMips32r6, Mips::FeatureMips64, Mips::FeatureMips64r2, Mips::FeatureMips64r3, Mips::FeatureMips64r5, Mips::FeatureMips64r6, Mips::FeatureCnMips, Mips::FeatureFP64Bit, Mips::FeatureGP64Bit, Mips::FeatureNaN2008 }; namespace { class MipsAsmParser : public MCTargetAsmParser { MipsTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast(TS); } MipsABIInfo ABI; SmallVector, 2> AssemblerOptions; MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a // nullptr, which indicates that no function is currently // selected. This usually happens after an '.end func' // directive. bool IsLittleEndian; bool IsPicEnabled; bool IsCpRestoreSet; int CpRestoreOffset; unsigned GPReg; unsigned CpSaveLocation; /// If true, then CpSaveLocation is a register, otherwise it's an offset. bool CpSaveLocationIsRegister; // Map of register aliases created via the .set directive. StringMap RegisterSets; // Print a warning along with its fix-it message at the given range. void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg, SMRange Range, bool ShowColors = true); void ConvertXWPOperands(MCInst &Inst, const OperandVector &Operands); #define GET_ASSEMBLER_HEADER #include "MipsGenAsmMatcher.inc" unsigned checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) override; unsigned checkTargetMatchPredicate(MCInst &Inst) override; bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; /// Parse a register as used in CFI directives bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool parseParenSuffix(StringRef Name, OperandVector &Operands); bool parseBracketSuffix(StringRef Name, OperandVector &Operands); bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID); bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; OperandMatchResultTy parseMemOperand(OperandVector &Operands); OperandMatchResultTy matchAnyRegisterNameWithoutDollar(OperandVector &Operands, StringRef Identifier, SMLoc S); OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands, const AsmToken &Token, SMLoc S); OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S); OperandMatchResultTy parseAnyRegister(OperandVector &Operands); OperandMatchResultTy parseImm(OperandVector &Operands); OperandMatchResultTy parseJumpTarget(OperandVector &Operands); OperandMatchResultTy parseInvNum(OperandVector &Operands); OperandMatchResultTy parseRegisterList(OperandVector &Operands); bool searchSymbolAlias(OperandVector &Operands); bool parseOperand(OperandVector &, StringRef Mnemonic); enum MacroExpanderResultTy { MER_NotAMacro, MER_Success, MER_Fail, }; // Expands assembly pseudo instructions. MacroExpanderResultTy tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg, bool Is32BitImm, bool IsAddress, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg, unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, MCSymbol *Sym); bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, bool Is64FPU, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandLoadAddress(unsigned DstReg, unsigned BaseReg, const MCOperand &Offset, bool Is32BitAddress, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); void expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI, bool IsLoad); bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI, const bool IsMips64, const bool Signed); bool expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI, bool IsLoad); bool expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); bool parseMemOffset(const MCExpr *&Res, bool isParenExpr); bool isEvaluated(const MCExpr *Expr); bool parseSetMips0Directive(); bool parseSetArchDirective(); bool parseSetFeature(uint64_t Feature); bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup. bool parseDirectiveCpLoad(SMLoc Loc); bool parseDirectiveCpLocal(SMLoc Loc); bool parseDirectiveCpRestore(SMLoc Loc); bool parseDirectiveCPSetup(); bool parseDirectiveCPReturn(); bool parseDirectiveNaN(); bool parseDirectiveSet(); bool parseDirectiveOption(); bool parseInsnDirective(); bool parseRSectionDirective(StringRef Section); bool parseSSectionDirective(StringRef Section, unsigned Type); bool parseSetAtDirective(); bool parseSetNoAtDirective(); bool parseSetMacroDirective(); bool parseSetNoMacroDirective(); bool parseSetMsaDirective(); bool parseSetNoMsaDirective(); bool parseSetNoDspDirective(); bool parseSetReorderDirective(); bool parseSetNoReorderDirective(); bool parseSetMips16Directive(); bool parseSetNoMips16Directive(); bool parseSetFpDirective(); bool parseSetOddSPRegDirective(); bool parseSetNoOddSPRegDirective(); bool parseSetPopDirective(); bool parseSetPushDirective(); bool parseSetSoftFloatDirective(); bool parseSetHardFloatDirective(); bool parseSetMtDirective(); bool parseSetNoMtDirective(); bool parseSetNoCRCDirective(); bool parseSetNoVirtDirective(); bool parseSetNoGINVDirective(); bool parseSetAssignment(); bool parseDirectiveGpWord(); bool parseDirectiveGpDWord(); bool parseDirectiveDtpRelWord(); bool parseDirectiveDtpRelDWord(); bool parseDirectiveTpRelWord(); bool parseDirectiveTpRelDWord(); bool parseDirectiveModule(); bool parseDirectiveModuleFP(); bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI, StringRef Directive); bool parseInternalDirectiveReallowModule(); bool eatComma(StringRef ErrorStr); int matchCPURegisterName(StringRef Symbol); int matchHWRegsRegisterName(StringRef Symbol); int matchFPURegisterName(StringRef Name); int matchFCCRegisterName(StringRef Name); int matchACRegisterName(StringRef Name); int matchMSA128RegisterName(StringRef Name); int matchMSA128CtrlRegisterName(StringRef Name); unsigned getReg(int RC, int RegNo); /// Returns the internal register number for the current AT. Also checks if /// the current AT is unavailable (set to $0) and gives an error if it is. /// This should be used in pseudo-instruction expansions which need AT. unsigned getATReg(SMLoc Loc); bool canUseATReg(); bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); // Helper function that checks if the value of a vector index is within the // boundaries of accepted values for each RegisterKind // Example: INSERT.B $w0[n], $1 => 16 > n >= 0 bool validateMSAIndex(int Val, int RegKind); // Selects a new architecture by updating the FeatureBits with the necessary // info including implied dependencies. // Internally, it clears all the feature bits related to *any* architecture // and selects the new one using the ToggleFeature functionality of the // MCSubtargetInfo object that handles implied dependencies. The reason we // clear all the arch related bits manually is because ToggleFeature only // clears the features that imply the feature being cleared and not the // features implied by the feature being cleared. This is easier to see // with an example: // -------------------------------------------------- // | Feature | Implies | // | -------------------------------------------------| // | FeatureMips1 | None | // | FeatureMips2 | FeatureMips1 | // | FeatureMips3 | FeatureMips2 | FeatureMipsGP64 | // | FeatureMips4 | FeatureMips3 | // | ... | | // -------------------------------------------------- // // Setting Mips3 is equivalent to set: (FeatureMips3 | FeatureMips2 | // FeatureMipsGP64 | FeatureMips1) // Clearing Mips3 is equivalent to clear (FeatureMips3 | FeatureMips4). void selectArch(StringRef ArchFeature) { MCSubtargetInfo &STI = copySTI(); FeatureBitset FeatureBits = STI.getFeatureBits(); FeatureBits &= ~MipsAssemblerOptions::AllArchRelatedMask; STI.setFeatureBits(FeatureBits); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(ArchFeature))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); } void setFeatureBits(uint64_t Feature, StringRef FeatureString) { if (!(getSTI().getFeatureBits()[Feature])) { MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); } } void clearFeatureBits(uint64_t Feature, StringRef FeatureString) { if (getSTI().getFeatureBits()[Feature]) { MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); } } void setModuleFeatureBits(uint64_t Feature, StringRef FeatureString) { setFeatureBits(Feature, FeatureString); AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits()); } void clearModuleFeatureBits(uint64_t Feature, StringRef FeatureString) { clearFeatureBits(Feature, FeatureString); AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits()); } public: enum MipsMatchResultTy { Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY, Match_RequiresDifferentOperands, Match_RequiresNoZeroRegister, Match_RequiresSameSrcAndDst, Match_NoFCCRegisterForCurrentISA, Match_NonZeroOperandForSync, Match_NonZeroOperandForMTCX, Match_RequiresPosSizeRange0_32, Match_RequiresPosSizeRange33_64, Match_RequiresPosSizeUImm6, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "MipsGenAsmMatcher.inc" #undef GET_OPERAND_DIAGNOSTIC_TYPES }; MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, sti, MII), ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()), sti.getCPU(), Options)) { MCAsmParserExtension::Initialize(parser); parser.addAliasForDirective(".asciiz", ".asciz"); parser.addAliasForDirective(".hword", ".2byte"); parser.addAliasForDirective(".word", ".4byte"); parser.addAliasForDirective(".dword", ".8byte"); // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); // Remember the initial assembler options. The user can not modify these. AssemblerOptions.push_back( llvm::make_unique(getSTI().getFeatureBits())); // Create an assembler options environment for the user to modify. AssemblerOptions.push_back( llvm::make_unique(getSTI().getFeatureBits())); getTargetStreamer().updateABIInfo(*this); if (!isABI_O32() && !useOddSPReg() != 0) report_fatal_error("-mno-odd-spreg requires the O32 ABI"); CurrentFn = nullptr; IsPicEnabled = getContext().getObjectFileInfo()->isPositionIndependent(); IsCpRestoreSet = false; CpRestoreOffset = -1; GPReg = ABI.GetGlobalPtr(); const Triple &TheTriple = sti.getTargetTriple(); IsLittleEndian = TheTriple.isLittleEndian(); if (getSTI().getCPU() == "mips64r6" && inMicroMipsMode()) report_fatal_error("microMIPS64R6 is not supported", false); if (!isABI_O32() && inMicroMipsMode()) report_fatal_error("microMIPS64 is not supported", false); } /// True if all of $fcc0 - $fcc7 exist for the current ISA. bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); } bool isGP64bit() const { return getSTI().getFeatureBits()[Mips::FeatureGP64Bit]; } bool isFP64bit() const { return getSTI().getFeatureBits()[Mips::FeatureFP64Bit]; } const MipsABIInfo &getABI() const { return ABI; } bool isABI_N32() const { return ABI.IsN32(); } bool isABI_N64() const { return ABI.IsN64(); } bool isABI_O32() const { return ABI.IsO32(); } bool isABI_FPXX() const { return getSTI().getFeatureBits()[Mips::FeatureFPXX]; } bool useOddSPReg() const { return !(getSTI().getFeatureBits()[Mips::FeatureNoOddSPReg]); } bool inMicroMipsMode() const { return getSTI().getFeatureBits()[Mips::FeatureMicroMips]; } bool hasMips1() const { return getSTI().getFeatureBits()[Mips::FeatureMips1]; } bool hasMips2() const { return getSTI().getFeatureBits()[Mips::FeatureMips2]; } bool hasMips3() const { return getSTI().getFeatureBits()[Mips::FeatureMips3]; } bool hasMips4() const { return getSTI().getFeatureBits()[Mips::FeatureMips4]; } bool hasMips5() const { return getSTI().getFeatureBits()[Mips::FeatureMips5]; } bool hasMips32() const { return getSTI().getFeatureBits()[Mips::FeatureMips32]; } bool hasMips64() const { return getSTI().getFeatureBits()[Mips::FeatureMips64]; } bool hasMips32r2() const { return getSTI().getFeatureBits()[Mips::FeatureMips32r2]; } bool hasMips64r2() const { return getSTI().getFeatureBits()[Mips::FeatureMips64r2]; } bool hasMips32r3() const { return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]); } bool hasMips64r3() const { return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]); } bool hasMips32r5() const { return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]); } bool hasMips64r5() const { return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]); } bool hasMips32r6() const { return getSTI().getFeatureBits()[Mips::FeatureMips32r6]; } bool hasMips64r6() const { return getSTI().getFeatureBits()[Mips::FeatureMips64r6]; } bool hasDSP() const { return getSTI().getFeatureBits()[Mips::FeatureDSP]; } bool hasDSPR2() const { return getSTI().getFeatureBits()[Mips::FeatureDSPR2]; } bool hasDSPR3() const { return getSTI().getFeatureBits()[Mips::FeatureDSPR3]; } bool hasMSA() const { return getSTI().getFeatureBits()[Mips::FeatureMSA]; } bool hasCnMips() const { return (getSTI().getFeatureBits()[Mips::FeatureCnMips]); } bool inPicMode() { return IsPicEnabled; } bool inMips16Mode() const { return getSTI().getFeatureBits()[Mips::FeatureMips16]; } bool useTraps() const { return getSTI().getFeatureBits()[Mips::FeatureUseTCCInDIV]; } bool useSoftFloat() const { return getSTI().getFeatureBits()[Mips::FeatureSoftFloat]; } bool hasMT() const { return getSTI().getFeatureBits()[Mips::FeatureMT]; } bool hasCRC() const { return getSTI().getFeatureBits()[Mips::FeatureCRC]; } bool hasVirt() const { return getSTI().getFeatureBits()[Mips::FeatureVirt]; } bool hasGINV() const { return getSTI().getFeatureBits()[Mips::FeatureGINV]; } /// Warn if RegIndex is the same as the current AT. void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc); void warnIfNoMacro(SMLoc Loc); bool isLittle() const { return IsLittleEndian; } const MCExpr *createTargetUnaryExpr(const MCExpr *E, AsmToken::TokenKind OperatorToken, MCContext &Ctx) override { switch(OperatorToken) { default: llvm_unreachable("Unknown token"); return nullptr; case AsmToken::PercentCall16: return MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, E, Ctx); case AsmToken::PercentCall_Hi: return MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16, E, Ctx); case AsmToken::PercentCall_Lo: return MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16, E, Ctx); case AsmToken::PercentDtprel_Hi: return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL_HI, E, Ctx); case AsmToken::PercentDtprel_Lo: return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL_LO, E, Ctx); case AsmToken::PercentGot: return MipsMCExpr::create(MipsMCExpr::MEK_GOT, E, Ctx); case AsmToken::PercentGot_Disp: return MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, E, Ctx); case AsmToken::PercentGot_Hi: return MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, E, Ctx); case AsmToken::PercentGot_Lo: return MipsMCExpr::create(MipsMCExpr::MEK_GOT_LO16, E, Ctx); case AsmToken::PercentGot_Ofst: return MipsMCExpr::create(MipsMCExpr::MEK_GOT_OFST, E, Ctx); case AsmToken::PercentGot_Page: return MipsMCExpr::create(MipsMCExpr::MEK_GOT_PAGE, E, Ctx); case AsmToken::PercentGottprel: return MipsMCExpr::create(MipsMCExpr::MEK_GOTTPREL, E, Ctx); case AsmToken::PercentGp_Rel: return MipsMCExpr::create(MipsMCExpr::MEK_GPREL, E, Ctx); case AsmToken::PercentHi: return MipsMCExpr::create(MipsMCExpr::MEK_HI, E, Ctx); case AsmToken::PercentHigher: return MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, E, Ctx); case AsmToken::PercentHighest: return MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, E, Ctx); case AsmToken::PercentLo: return MipsMCExpr::create(MipsMCExpr::MEK_LO, E, Ctx); case AsmToken::PercentNeg: return MipsMCExpr::create(MipsMCExpr::MEK_NEG, E, Ctx); case AsmToken::PercentPcrel_Hi: return MipsMCExpr::create(MipsMCExpr::MEK_PCREL_HI16, E, Ctx); case AsmToken::PercentPcrel_Lo: return MipsMCExpr::create(MipsMCExpr::MEK_PCREL_LO16, E, Ctx); case AsmToken::PercentTlsgd: return MipsMCExpr::create(MipsMCExpr::MEK_TLSGD, E, Ctx); case AsmToken::PercentTlsldm: return MipsMCExpr::create(MipsMCExpr::MEK_TLSLDM, E, Ctx); case AsmToken::PercentTprel_Hi: return MipsMCExpr::create(MipsMCExpr::MEK_TPREL_HI, E, Ctx); case AsmToken::PercentTprel_Lo: return MipsMCExpr::create(MipsMCExpr::MEK_TPREL_LO, E, Ctx); } } }; /// MipsOperand - Instances of this class represent a parsed Mips machine /// instruction. class MipsOperand : public MCParsedAsmOperand { public: /// Broad categories of register classes /// The exact class is finalized by the render method. enum RegKind { RegKind_GPR = 1, /// GPR32 and GPR64 (depending on isGP64bit()) RegKind_FGR = 2, /// FGR32, FGR64, AFGR64 (depending on context and /// isFP64bit()) RegKind_FCC = 4, /// FCC RegKind_MSA128 = 8, /// MSA128[BHWD] (makes no difference which) RegKind_MSACtrl = 16, /// MSA control registers RegKind_COP2 = 32, /// COP2 RegKind_ACC = 64, /// HI32DSP, LO32DSP, and ACC64DSP (depending on /// context). RegKind_CCR = 128, /// CCR RegKind_HWRegs = 256, /// HWRegs RegKind_COP3 = 512, /// COP3 RegKind_COP0 = 1024, /// COP0 /// Potentially any (e.g. $1) RegKind_Numeric = RegKind_GPR | RegKind_FGR | RegKind_FCC | RegKind_MSA128 | RegKind_MSACtrl | RegKind_COP2 | RegKind_ACC | RegKind_CCR | RegKind_HWRegs | RegKind_COP3 | RegKind_COP0 }; private: enum KindTy { k_Immediate, /// An immediate (possibly involving symbol references) k_Memory, /// Base + Offset Memory Address k_RegisterIndex, /// A register index in one or more RegKind. k_Token, /// A simple token k_RegList, /// A physical register list } Kind; public: MipsOperand(KindTy K, MipsAsmParser &Parser) : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {} ~MipsOperand() override { switch (Kind) { case k_Memory: delete Mem.Base; break; case k_RegList: delete RegList.List; break; case k_Immediate: case k_RegisterIndex: case k_Token: break; } } private: /// For diagnostics, and checking the assembler temporary MipsAsmParser &AsmParser; struct Token { const char *Data; unsigned Length; }; struct RegIdxOp { unsigned Index; /// Index into the register class RegKind Kind; /// Bitfield of the kinds it could possibly be struct Token Tok; /// The input token this operand originated from. const MCRegisterInfo *RegInfo; }; struct ImmOp { const MCExpr *Val; }; struct MemOp { MipsOperand *Base; const MCExpr *Off; }; struct RegListOp { SmallVector *List; }; union { struct Token Tok; struct RegIdxOp RegIdx; struct ImmOp Imm; struct MemOp Mem; struct RegListOp RegList; }; SMLoc StartLoc, EndLoc; /// Internal constructor for register kinds static std::unique_ptr CreateReg(unsigned Index, StringRef Str, RegKind RegKind, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { auto Op = llvm::make_unique(k_RegisterIndex, Parser); Op->RegIdx.Index = Index; Op->RegIdx.RegInfo = RegInfo; Op->RegIdx.Kind = RegKind; Op->RegIdx.Tok.Data = Str.data(); Op->RegIdx.Tok.Length = Str.size(); Op->StartLoc = S; Op->EndLoc = E; return Op; } public: /// Coerce the register to GPR32 and return the real register for the current /// target. unsigned getGPR32Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!"); AsmParser.warnIfRegIndexIsAT(RegIdx.Index, StartLoc); unsigned ClassID = Mips::GPR32RegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to GPR32 and return the real register for the current /// target. unsigned getGPRMM16Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!"); unsigned ClassID = Mips::GPR32RegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to GPR64 and return the real register for the current /// target. unsigned getGPR64Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!"); unsigned ClassID = Mips::GPR64RegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } private: /// Coerce the register to AFGR64 and return the real register for the current /// target. unsigned getAFGR64Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!"); if (RegIdx.Index % 2 != 0) AsmParser.Warning(StartLoc, "Float register should be even."); return RegIdx.RegInfo->getRegClass(Mips::AFGR64RegClassID) .getRegister(RegIdx.Index / 2); } /// Coerce the register to FGR64 and return the real register for the current /// target. unsigned getFGR64Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!"); return RegIdx.RegInfo->getRegClass(Mips::FGR64RegClassID) .getRegister(RegIdx.Index); } /// Coerce the register to FGR32 and return the real register for the current /// target. unsigned getFGR32Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!"); return RegIdx.RegInfo->getRegClass(Mips::FGR32RegClassID) .getRegister(RegIdx.Index); } /// Coerce the register to FCC and return the real register for the current /// target. unsigned getFCCReg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_FCC) && "Invalid access!"); return RegIdx.RegInfo->getRegClass(Mips::FCCRegClassID) .getRegister(RegIdx.Index); } /// Coerce the register to MSA128 and return the real register for the current /// target. unsigned getMSA128Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_MSA128) && "Invalid access!"); // It doesn't matter which of the MSA128[BHWD] classes we use. They are all // identical unsigned ClassID = Mips::MSA128BRegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to MSACtrl and return the real register for the /// current target. unsigned getMSACtrlReg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_MSACtrl) && "Invalid access!"); unsigned ClassID = Mips::MSACtrlRegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to COP0 and return the real register for the /// current target. unsigned getCOP0Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_COP0) && "Invalid access!"); unsigned ClassID = Mips::COP0RegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to COP2 and return the real register for the /// current target. unsigned getCOP2Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_COP2) && "Invalid access!"); unsigned ClassID = Mips::COP2RegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to COP3 and return the real register for the /// current target. unsigned getCOP3Reg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_COP3) && "Invalid access!"); unsigned ClassID = Mips::COP3RegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to ACC64DSP and return the real register for the /// current target. unsigned getACC64DSPReg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!"); unsigned ClassID = Mips::ACC64DSPRegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to HI32DSP and return the real register for the /// current target. unsigned getHI32DSPReg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!"); unsigned ClassID = Mips::HI32DSPRegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to LO32DSP and return the real register for the /// current target. unsigned getLO32DSPReg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!"); unsigned ClassID = Mips::LO32DSPRegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to CCR and return the real register for the /// current target. unsigned getCCRReg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_CCR) && "Invalid access!"); unsigned ClassID = Mips::CCRRegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } /// Coerce the register to HWRegs and return the real register for the /// current target. unsigned getHWRegsReg() const { assert(isRegIdx() && (RegIdx.Kind & RegKind_HWRegs) && "Invalid access!"); unsigned ClassID = Mips::HWRegsRegClassID; return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } public: void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediate when possible. Null MCExpr = 0. if (!Expr) Inst.addOperand(MCOperand::createImm(0)); else if (const MCConstantExpr *CE = dyn_cast(Expr)) Inst.addOperand(MCOperand::createImm(CE->getValue())); else Inst.addOperand(MCOperand::createExpr(Expr)); } void addRegOperands(MCInst &Inst, unsigned N) const { llvm_unreachable("Use a custom parser instead"); } /// Render the operand to an MCInst as a GPR32 /// Asserts if the wrong number of operands are requested, or the operand /// is not a k_RegisterIndex compatible with RegKind_GPR void addGPR32ZeroAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPR32Reg())); } void addGPR32NonZeroAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPR32Reg())); } void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPR32Reg())); } void addGPRMM16AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPRMM16Reg())); } void addGPRMM16AsmRegZeroOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPRMM16Reg())); } void addGPRMM16AsmRegMovePOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPRMM16Reg())); } void addGPRMM16AsmRegMovePPairFirstOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPRMM16Reg())); } void addGPRMM16AsmRegMovePPairSecondOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPRMM16Reg())); } /// Render the operand to an MCInst as a GPR64 /// Asserts if the wrong number of operands are requested, or the operand /// is not a k_RegisterIndex compatible with RegKind_GPR void addGPR64AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getGPR64Reg())); } void addAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getAFGR64Reg())); } void addStrictlyAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getAFGR64Reg())); } void addStrictlyFGR64AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getFGR64Reg())); } void addFGR64AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getFGR64Reg())); } void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getFGR32Reg())); // FIXME: We ought to do this for -integrated-as without -via-file-asm too. // FIXME: This should propagate failure up to parseStatement. if (!AsmParser.useOddSPReg() && RegIdx.Index & 1) AsmParser.getParser().printError( StartLoc, "-mno-odd-spreg prohibits the use of odd FPU " "registers"); } void addStrictlyFGR32AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getFGR32Reg())); // FIXME: We ought to do this for -integrated-as without -via-file-asm too. if (!AsmParser.useOddSPReg() && RegIdx.Index & 1) AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU " "registers"); } void addFCCAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getFCCReg())); } void addMSA128AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getMSA128Reg())); } void addMSACtrlAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getMSACtrlReg())); } void addCOP0AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getCOP0Reg())); } void addCOP2AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getCOP2Reg())); } void addCOP3AsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getCOP3Reg())); } void addACC64DSPAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getACC64DSPReg())); } void addHI32DSPAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getHI32DSPReg())); } void addLO32DSPAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getLO32DSPReg())); } void addCCRAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getCCRReg())); } void addHWRegsAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getHWRegsReg())); } template void addConstantUImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); uint64_t Imm = getConstantImm() - Offset; Imm &= (1ULL << Bits) - 1; Imm += Offset; Imm += AdjustOffset; Inst.addOperand(MCOperand::createImm(Imm)); } template void addSImmOperands(MCInst &Inst, unsigned N) const { if (isImm() && !isConstantImm()) { addExpr(Inst, getImm()); return; } addConstantSImmOperands(Inst, N); } template void addUImmOperands(MCInst &Inst, unsigned N) const { if (isImm() && !isConstantImm()) { addExpr(Inst, getImm()); return; } addConstantUImmOperands(Inst, N); } template void addConstantSImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); int64_t Imm = getConstantImm() - Offset; Imm = SignExtend64(Imm); Imm += Offset; Imm += AdjustOffset; Inst.addOperand(MCOperand::createImm(Imm)); } void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCExpr *Expr = getImm(); addExpr(Inst, Expr); } void addMemOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(AsmParser.getABI().ArePtrs64bit() ? getMemBase()->getGPR64Reg() : getMemBase()->getGPR32Reg())); const MCExpr *Expr = getMemOff(); addExpr(Inst, Expr); } void addMicroMipsMemOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getMemBase()->getGPRMM16Reg())); const MCExpr *Expr = getMemOff(); addExpr(Inst, Expr); } void addRegListOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); for (auto RegNo : getRegList()) Inst.addOperand(MCOperand::createReg(RegNo)); } bool isReg() const override { // As a special case until we sort out the definition of div/divu, accept // $0/$zero here so that MCK_ZERO works correctly. return isGPRAsmReg() && RegIdx.Index == 0; } bool isRegIdx() const { return Kind == k_RegisterIndex; } bool isImm() const override { return Kind == k_Immediate; } bool isConstantImm() const { int64_t Res; return isImm() && getImm()->evaluateAsAbsolute(Res); } bool isConstantImmz() const { return isConstantImm() && getConstantImm() == 0; } template bool isConstantUImm() const { return isConstantImm() && isUInt(getConstantImm() - Offset); } template bool isSImm() const { return isConstantImm() ? isInt(getConstantImm()) : isImm(); } template bool isUImm() const { return isConstantImm() ? isUInt(getConstantImm()) : isImm(); } template bool isAnyImm() const { return isConstantImm() ? (isInt(getConstantImm()) || isUInt(getConstantImm())) : isImm(); } template bool isConstantSImm() const { return isConstantImm() && isInt(getConstantImm() - Offset); } template bool isConstantUImmRange() const { return isConstantImm() && getConstantImm() >= Bottom && getConstantImm() <= Top; } bool isToken() const override { // Note: It's not possible to pretend that other operand kinds are tokens. // The matcher emitter checks tokens first. return Kind == k_Token; } bool isMem() const override { return Kind == k_Memory; } bool isConstantMemOff() const { return isMem() && isa(getMemOff()); } // Allow relocation operators. // FIXME: This predicate and others need to look through binary expressions // and determine whether a Value is a constant or not. template bool isMemWithSimmOffset() const { if (!isMem()) return false; if (!getMemBase()->isGPRAsmReg()) return false; if (isa(getMemOff()) || (isConstantMemOff() && isShiftedInt(getConstantMemOff()))) return true; MCValue Res; bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr); return IsReloc && isShiftedInt(Res.getConstant()); } bool isMemWithPtrSizeOffset() const { if (!isMem()) return false; if (!getMemBase()->isGPRAsmReg()) return false; const unsigned PtrBits = AsmParser.getABI().ArePtrs64bit() ? 64 : 32; if (isa(getMemOff()) || (isConstantMemOff() && isIntN(PtrBits, getConstantMemOff()))) return true; MCValue Res; bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr); return IsReloc && isIntN(PtrBits, Res.getConstant()); } bool isMemWithGRPMM16Base() const { return isMem() && getMemBase()->isMM16AsmReg(); } template bool isMemWithUimmOffsetSP() const { return isMem() && isConstantMemOff() && isUInt(getConstantMemOff()) && getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP); } template bool isMemWithUimmWordAlignedOffsetSP() const { return isMem() && isConstantMemOff() && isUInt(getConstantMemOff()) && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP); } template bool isMemWithSimmWordAlignedOffsetGP() const { return isMem() && isConstantMemOff() && isInt(getConstantMemOff()) && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::GP); } template bool isScaledUImm() const { return isConstantImm() && isShiftedUInt(getConstantImm()); } template bool isScaledSImm() const { if (isConstantImm() && isShiftedInt(getConstantImm())) return true; // Operand can also be a symbol or symbol plus // offset in case of relocations. if (Kind != k_Immediate) return false; MCValue Res; bool Success = getImm()->evaluateAsRelocatable(Res, nullptr, nullptr); return Success && isShiftedInt(Res.getConstant()); } bool isRegList16() const { if (!isRegList()) return false; int Size = RegList.List->size(); if (Size < 2 || Size > 5) return false; unsigned R0 = RegList.List->front(); unsigned R1 = RegList.List->back(); if (!((R0 == Mips::S0 && R1 == Mips::RA) || (R0 == Mips::S0_64 && R1 == Mips::RA_64))) return false; int PrevReg = *RegList.List->begin(); for (int i = 1; i < Size - 1; i++) { int Reg = (*(RegList.List))[i]; if ( Reg != PrevReg + 1) return false; PrevReg = Reg; } return true; } bool isInvNum() const { return Kind == k_Immediate; } bool isLSAImm() const { if (!isConstantImm()) return false; int64_t Val = getConstantImm(); return 1 <= Val && Val <= 4; } bool isRegList() const { return Kind == k_RegList; } StringRef getToken() const { assert(Kind == k_Token && "Invalid access!"); return StringRef(Tok.Data, Tok.Length); } unsigned getReg() const override { // As a special case until we sort out the definition of div/divu, accept // $0/$zero here so that MCK_ZERO works correctly. if (Kind == k_RegisterIndex && RegIdx.Index == 0 && RegIdx.Kind & RegKind_GPR) return getGPR32Reg(); // FIXME: GPR64 too llvm_unreachable("Invalid access!"); return 0; } const MCExpr *getImm() const { assert((Kind == k_Immediate) && "Invalid access!"); return Imm.Val; } int64_t getConstantImm() const { const MCExpr *Val = getImm(); int64_t Value = 0; (void)Val->evaluateAsAbsolute(Value); return Value; } MipsOperand *getMemBase() const { assert((Kind == k_Memory) && "Invalid access!"); return Mem.Base; } const MCExpr *getMemOff() const { assert((Kind == k_Memory) && "Invalid access!"); return Mem.Off; } int64_t getConstantMemOff() const { return static_cast(getMemOff())->getValue(); } const SmallVectorImpl &getRegList() const { assert((Kind == k_RegList) && "Invalid access!"); return *(RegList.List); } static std::unique_ptr CreateToken(StringRef Str, SMLoc S, MipsAsmParser &Parser) { auto Op = llvm::make_unique(k_Token, Parser); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->StartLoc = S; Op->EndLoc = S; return Op; } /// Create a numeric register (e.g. $1). The exact register remains /// unresolved until an instruction successfully matches static std::unique_ptr createNumericReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { LLVM_DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n"); return CreateReg(Index, Str, RegKind_Numeric, RegInfo, S, E, Parser); } /// Create a register that is definitely a GPR. /// This is typically only used for named registers such as $gp. static std::unique_ptr createGPRReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, Str, RegKind_GPR, RegInfo, S, E, Parser); } /// Create a register that is definitely a FGR. /// This is typically only used for named registers such as $f0. static std::unique_ptr createFGRReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, Str, RegKind_FGR, RegInfo, S, E, Parser); } /// Create a register that is definitely a HWReg. /// This is typically only used for named registers such as $hwr_cpunum. static std::unique_ptr createHWRegsReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, Str, RegKind_HWRegs, RegInfo, S, E, Parser); } /// Create a register that is definitely an FCC. /// This is typically only used for named registers such as $fcc0. static std::unique_ptr createFCCReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, Str, RegKind_FCC, RegInfo, S, E, Parser); } /// Create a register that is definitely an ACC. /// This is typically only used for named registers such as $ac0. static std::unique_ptr createACCReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, Str, RegKind_ACC, RegInfo, S, E, Parser); } /// Create a register that is definitely an MSA128. /// This is typically only used for named registers such as $w0. static std::unique_ptr createMSA128Reg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, Str, RegKind_MSA128, RegInfo, S, E, Parser); } /// Create a register that is definitely an MSACtrl. /// This is typically only used for named registers such as $msaaccess. static std::unique_ptr createMSACtrlReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E, MipsAsmParser &Parser) { return CreateReg(Index, Str, RegKind_MSACtrl, RegInfo, S, E, Parser); } static std::unique_ptr CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) { auto Op = llvm::make_unique(k_Immediate, Parser); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; return Op; } static std::unique_ptr CreateMem(std::unique_ptr Base, const MCExpr *Off, SMLoc S, SMLoc E, MipsAsmParser &Parser) { auto Op = llvm::make_unique(k_Memory, Parser); Op->Mem.Base = Base.release(); Op->Mem.Off = Off; Op->StartLoc = S; Op->EndLoc = E; return Op; } static std::unique_ptr CreateRegList(SmallVectorImpl &Regs, SMLoc StartLoc, SMLoc EndLoc, MipsAsmParser &Parser) { assert(Regs.size() > 0 && "Empty list not allowed"); auto Op = llvm::make_unique(k_RegList, Parser); Op->RegList.List = new SmallVector(Regs.begin(), Regs.end()); Op->StartLoc = StartLoc; Op->EndLoc = EndLoc; return Op; } bool isGPRZeroAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index == 0; } bool isGPRNonZeroAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index > 0 && RegIdx.Index <= 31; } bool isGPRAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31; } bool isMM16AsmReg() const { if (!(isRegIdx() && RegIdx.Kind)) return false; return ((RegIdx.Index >= 2 && RegIdx.Index <= 7) || RegIdx.Index == 16 || RegIdx.Index == 17); } bool isMM16AsmRegZero() const { if (!(isRegIdx() && RegIdx.Kind)) return false; return (RegIdx.Index == 0 || (RegIdx.Index >= 2 && RegIdx.Index <= 7) || RegIdx.Index == 17); } bool isMM16AsmRegMoveP() const { if (!(isRegIdx() && RegIdx.Kind)) return false; return (RegIdx.Index == 0 || (RegIdx.Index >= 2 && RegIdx.Index <= 3) || (RegIdx.Index >= 16 && RegIdx.Index <= 20)); } bool isMM16AsmRegMovePPairFirst() const { if (!(isRegIdx() && RegIdx.Kind)) return false; return RegIdx.Index >= 4 && RegIdx.Index <= 6; } bool isMM16AsmRegMovePPairSecond() const { if (!(isRegIdx() && RegIdx.Kind)) return false; return (RegIdx.Index == 21 || RegIdx.Index == 22 || (RegIdx.Index >= 5 && RegIdx.Index <= 7)); } bool isFGRAsmReg() const { // AFGR64 is $0-$15 but we handle this in getAFGR64() return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31; } bool isStrictlyFGRAsmReg() const { // AFGR64 is $0-$15 but we handle this in getAFGR64() return isRegIdx() && RegIdx.Kind == RegKind_FGR && RegIdx.Index <= 31; } bool isHWRegsAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_HWRegs && RegIdx.Index <= 31; } bool isCCRAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31; } bool isFCCAsmReg() const { if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC)) return false; return RegIdx.Index <= 7; } bool isACCAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3; } bool isCOP0AsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_COP0 && RegIdx.Index <= 31; } bool isCOP2AsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31; } bool isCOP3AsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_COP3 && RegIdx.Index <= 31; } bool isMSA128AsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31; } bool isMSACtrlAsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_MSACtrl && RegIdx.Index <= 7; } /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const override { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const override { return EndLoc; } void print(raw_ostream &OS) const override { switch (Kind) { case k_Immediate: OS << "Imm<"; OS << *Imm.Val; OS << ">"; break; case k_Memory: OS << "Mem<"; Mem.Base->print(OS); OS << ", "; OS << *Mem.Off; OS << ">"; break; case k_RegisterIndex: OS << "RegIdx<" << RegIdx.Index << ":" << RegIdx.Kind << ", " << StringRef(RegIdx.Tok.Data, RegIdx.Tok.Length) << ">"; break; case k_Token: OS << getToken(); break; case k_RegList: OS << "RegList< "; for (auto Reg : (*RegList.List)) OS << Reg << " "; OS << ">"; break; } } bool isValidForTie(const MipsOperand &Other) const { if (Kind != Other.Kind) return false; switch (Kind) { default: llvm_unreachable("Unexpected kind"); return false; case k_RegisterIndex: { StringRef Token(RegIdx.Tok.Data, RegIdx.Tok.Length); StringRef OtherToken(Other.RegIdx.Tok.Data, Other.RegIdx.Tok.Length); return Token == OtherToken; } } } }; // class MipsOperand } // end anonymous namespace namespace llvm { extern const MCInstrDesc MipsInsts[]; } // end namespace llvm static const MCInstrDesc &getInstDesc(unsigned Opcode) { return MipsInsts[Opcode]; } static bool hasShortDelaySlot(MCInst &Inst) { switch (Inst.getOpcode()) { case Mips::BEQ_MM: case Mips::BNE_MM: case Mips::BLTZ_MM: case Mips::BGEZ_MM: case Mips::BLEZ_MM: case Mips::BGTZ_MM: case Mips::JRC16_MM: case Mips::JALS_MM: case Mips::JALRS_MM: case Mips::JALRS16_MM: case Mips::BGEZALS_MM: case Mips::BLTZALS_MM: return true; case Mips::J_MM: return !Inst.getOperand(0).isReg(); default: return false; } } static const MCSymbol *getSingleMCSymbol(const MCExpr *Expr) { if (const MCSymbolRefExpr *SRExpr = dyn_cast(Expr)) { return &SRExpr->getSymbol(); } if (const MCBinaryExpr *BExpr = dyn_cast(Expr)) { const MCSymbol *LHSSym = getSingleMCSymbol(BExpr->getLHS()); const MCSymbol *RHSSym = getSingleMCSymbol(BExpr->getRHS()); if (LHSSym) return LHSSym; if (RHSSym) return RHSSym; return nullptr; } if (const MCUnaryExpr *UExpr = dyn_cast(Expr)) return getSingleMCSymbol(UExpr->getSubExpr()); return nullptr; } static unsigned countMCSymbolRefExpr(const MCExpr *Expr) { if (isa(Expr)) return 1; if (const MCBinaryExpr *BExpr = dyn_cast(Expr)) return countMCSymbolRefExpr(BExpr->getLHS()) + countMCSymbolRefExpr(BExpr->getRHS()); if (const MCUnaryExpr *UExpr = dyn_cast(Expr)) return countMCSymbolRefExpr(UExpr->getSubExpr()); return 0; } bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); bool ExpandedJalSym = false; Inst.setLoc(IDLoc); if (MCID.isBranch() || MCID.isCall()) { const unsigned Opcode = Inst.getOpcode(); MCOperand Offset; switch (Opcode) { default: break; case Mips::BBIT0: case Mips::BBIT032: case Mips::BBIT1: case Mips::BBIT132: assert(hasCnMips() && "instruction only valid for octeon cpus"); LLVM_FALLTHROUGH; case Mips::BEQ: case Mips::BNE: case Mips::BEQ_MM: case Mips::BNE_MM: assert(MCID.getNumOperands() == 3 && "unexpected number of operands"); Offset = Inst.getOperand(2); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << (inMicroMipsMode() ? 1 : 2))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BGEZ: case Mips::BGTZ: case Mips::BLEZ: case Mips::BLTZ: case Mips::BGEZAL: case Mips::BLTZAL: case Mips::BC1F: case Mips::BC1T: case Mips::BGEZ_MM: case Mips::BGTZ_MM: case Mips::BLEZ_MM: case Mips::BLTZ_MM: case Mips::BGEZAL_MM: case Mips::BLTZAL_MM: case Mips::BC1F_MM: case Mips::BC1T_MM: case Mips::BC1EQZC_MMR6: case Mips::BC1NEZC_MMR6: case Mips::BC2EQZC_MMR6: case Mips::BC2NEZC_MMR6: assert(MCID.getNumOperands() == 2 && "unexpected number of operands"); Offset = Inst.getOperand(1); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << (inMicroMipsMode() ? 1 : 2))) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BGEC: case Mips::BGEC_MMR6: case Mips::BLTC: case Mips::BLTC_MMR6: case Mips::BGEUC: case Mips::BGEUC_MMR6: case Mips::BLTUC: case Mips::BLTUC_MMR6: case Mips::BEQC: case Mips::BEQC_MMR6: case Mips::BNEC: case Mips::BNEC_MMR6: assert(MCID.getNumOperands() == 3 && "unexpected number of operands"); Offset = Inst.getOperand(2); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. if (!isIntN(18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BLEZC: case Mips::BLEZC_MMR6: case Mips::BGEZC: case Mips::BGEZC_MMR6: case Mips::BGTZC: case Mips::BGTZC_MMR6: case Mips::BLTZC: case Mips::BLTZC_MMR6: assert(MCID.getNumOperands() == 2 && "unexpected number of operands"); Offset = Inst.getOperand(1); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. if (!isIntN(18, Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZC: case Mips::BEQZC_MMR6: case Mips::BNEZC: case Mips::BNEZC_MMR6: assert(MCID.getNumOperands() == 2 && "unexpected number of operands"); Offset = Inst.getOperand(1); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. if (!isIntN(23, Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << 2)) return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZ16_MM: case Mips::BEQZC16_MMR6: case Mips::BNEZ16_MM: case Mips::BNEZC16_MMR6: assert(MCID.getNumOperands() == 2 && "unexpected number of operands"); Offset = Inst.getOperand(1); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. if (!isInt<8>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 2LL)) return Error(IDLoc, "branch to misaligned address"); break; } } // SSNOP is deprecated on MIPS32r6/MIPS64r6 // We still accept it but it is a normal nop. if (hasMips32r6() && Inst.getOpcode() == Mips::SSNOP) { std::string ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6"; Warning(IDLoc, "ssnop is deprecated for " + ISA + " and is equivalent to a " "nop instruction"); } if (hasCnMips()) { const unsigned Opcode = Inst.getOpcode(); MCOperand Opnd; int Imm; switch (Opcode) { default: break; case Mips::BBIT0: case Mips::BBIT032: case Mips::BBIT1: case Mips::BBIT132: assert(MCID.getNumOperands() == 3 && "unexpected number of operands"); // The offset is handled above Opnd = Inst.getOperand(1); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (Imm < 0 || Imm > (Opcode == Mips::BBIT0 || Opcode == Mips::BBIT1 ? 63 : 31)) return Error(IDLoc, "immediate operand value out of range"); if (Imm > 31) { Inst.setOpcode(Opcode == Mips::BBIT0 ? Mips::BBIT032 : Mips::BBIT132); Inst.getOperand(1).setImm(Imm - 32); } break; case Mips::SEQi: case Mips::SNEi: assert(MCID.getNumOperands() == 3 && "unexpected number of operands"); Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (!isInt<10>(Imm)) return Error(IDLoc, "immediate operand value out of range"); break; } } // Warn on division by zero. We're checking here as all instructions get // processed here, not just the macros that need expansion. // // The MIPS backend models most of the divison instructions and macros as // three operand instructions. The pre-R6 divide instructions however have // two operands and explicitly define HI/LO as part of the instruction, // not in the operands. unsigned FirstOp = 1; unsigned SecondOp = 2; switch (Inst.getOpcode()) { default: break; case Mips::SDivIMacro: case Mips::UDivIMacro: case Mips::DSDivIMacro: case Mips::DUDivIMacro: if (Inst.getOperand(2).getImm() == 0) { if (Inst.getOperand(1).getReg() == Mips::ZERO || Inst.getOperand(1).getReg() == Mips::ZERO_64) Warning(IDLoc, "dividing zero by zero"); else Warning(IDLoc, "division by zero"); } break; case Mips::DSDIV: case Mips::SDIV: case Mips::UDIV: case Mips::DUDIV: case Mips::UDIV_MM: case Mips::SDIV_MM: FirstOp = 0; SecondOp = 1; LLVM_FALLTHROUGH; case Mips::SDivMacro: case Mips::DSDivMacro: case Mips::UDivMacro: case Mips::DUDivMacro: case Mips::DIV: case Mips::DIVU: case Mips::DDIV: case Mips::DDIVU: case Mips::DIVU_MMR6: case Mips::DIV_MMR6: if (Inst.getOperand(SecondOp).getReg() == Mips::ZERO || Inst.getOperand(SecondOp).getReg() == Mips::ZERO_64) { if (Inst.getOperand(FirstOp).getReg() == Mips::ZERO || Inst.getOperand(FirstOp).getReg() == Mips::ZERO_64) Warning(IDLoc, "dividing zero by zero"); else Warning(IDLoc, "division by zero"); } break; } // For PIC code convert unconditional jump to unconditional branch. if ((Inst.getOpcode() == Mips::J || Inst.getOpcode() == Mips::J_MM) && inPicMode()) { MCInst BInst; BInst.setOpcode(inMicroMipsMode() ? Mips::BEQ_MM : Mips::BEQ); BInst.addOperand(MCOperand::createReg(Mips::ZERO)); BInst.addOperand(MCOperand::createReg(Mips::ZERO)); BInst.addOperand(Inst.getOperand(0)); Inst = BInst; } // This expansion is not in a function called by tryExpandInstruction() // because the pseudo-instruction doesn't have a distinct opcode. if ((Inst.getOpcode() == Mips::JAL || Inst.getOpcode() == Mips::JAL_MM) && inPicMode()) { warnIfNoMacro(IDLoc); const MCExpr *JalExpr = Inst.getOperand(0).getExpr(); // We can do this expansion if there's only 1 symbol in the argument // expression. if (countMCSymbolRefExpr(JalExpr) > 1) return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode"); // FIXME: This is checking the expression can be handled by the later stages // of the assembler. We ought to leave it to those later stages. const MCSymbol *JalSym = getSingleMCSymbol(JalExpr); // FIXME: Add support for label+offset operands (currently causes an error). // FIXME: Add support for forward-declared local symbols. // FIXME: Add expansion for when the LargeGOT option is enabled. if (JalSym->isInSection() || JalSym->isTemporary() || (JalSym->isELF() && cast(JalSym)->getBinding() == ELF::STB_LOCAL)) { if (isABI_O32()) { // If it's a local symbol and the O32 ABI is being used, we expand to: // lw $25, 0($gp) // R_(MICRO)MIPS_GOT16 label // addiu $25, $25, 0 // R_(MICRO)MIPS_LO16 label // jalr $25 const MCExpr *Got16RelocExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT, JalExpr, getContext()); const MCExpr *Lo16RelocExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, JalExpr, getContext()); TOut.emitRRX(Mips::LW, Mips::T9, GPReg, MCOperand::createExpr(Got16RelocExpr), IDLoc, STI); TOut.emitRRX(Mips::ADDiu, Mips::T9, Mips::T9, MCOperand::createExpr(Lo16RelocExpr), IDLoc, STI); } else if (isABI_N32() || isABI_N64()) { // If it's a local symbol and the N32/N64 ABIs are being used, // we expand to: // lw/ld $25, 0($gp) // R_(MICRO)MIPS_GOT_DISP label // jalr $25 const MCExpr *GotDispRelocExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, JalExpr, getContext()); TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, GPReg, MCOperand::createExpr(GotDispRelocExpr), IDLoc, STI); } } else { // If it's an external/weak symbol, we expand to: // lw/ld $25, 0($gp) // R_(MICRO)MIPS_CALL16 label // jalr $25 const MCExpr *Call16RelocExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, JalExpr, getContext()); TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, GPReg, MCOperand::createExpr(Call16RelocExpr), IDLoc, STI); } MCInst JalrInst; if (IsCpRestoreSet && inMicroMipsMode()) JalrInst.setOpcode(Mips::JALRS_MM); else JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); JalrInst.addOperand(MCOperand::createReg(Mips::RA)); JalrInst.addOperand(MCOperand::createReg(Mips::T9)); if (EmitJalrReloc) { // As an optimization hint for the linker, before the JALR we add: // .reloc tmplabel, R_{MICRO}MIPS_JALR, symbol // tmplabel: MCSymbol *TmpLabel = getContext().createTempSymbol(); const MCExpr *TmpExpr = MCSymbolRefExpr::create(TmpLabel, getContext()); const MCExpr *RelocJalrExpr = MCSymbolRefExpr::create(JalSym, MCSymbolRefExpr::VK_None, getContext(), IDLoc); TOut.getStreamer().EmitRelocDirective(*TmpExpr, inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR", RelocJalrExpr, IDLoc, *STI); TOut.getStreamer().EmitLabel(TmpLabel); } Inst = JalrInst; ExpandedJalSym = true; } bool IsPCRelativeLoad = (MCID.TSFlags & MipsII::IsPCRelativeLoad) != 0; if ((MCID.mayLoad() || MCID.mayStore()) && !IsPCRelativeLoad) { // Check the offset of memory operand, if it is a symbol // reference or immediate we may have to expand instructions. for (unsigned i = 0; i < MCID.getNumOperands(); i++) { const MCOperandInfo &OpInfo = MCID.OpInfo[i]; if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) || (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) { MCOperand &Op = Inst.getOperand(i); if (Op.isImm()) { int64_t MemOffset = Op.getImm(); if (MemOffset < -32768 || MemOffset > 32767) { // Offset can't exceed 16bit value. expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad()); return getParser().hasPendingError(); } } else if (Op.isExpr()) { const MCExpr *Expr = Op.getExpr(); if (Expr->getKind() == MCExpr::SymbolRef) { const MCSymbolRefExpr *SR = static_cast(Expr); if (SR->getKind() == MCSymbolRefExpr::VK_None) { // Expand symbol. expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad()); return getParser().hasPendingError(); } } else if (!isEvaluated(Expr)) { expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad()); return getParser().hasPendingError(); } } } } // for } // if load/store if (inMicroMipsMode()) { if (MCID.mayLoad() && Inst.getOpcode() != Mips::LWP_MM) { // Try to create 16-bit GP relative load instruction. for (unsigned i = 0; i < MCID.getNumOperands(); i++) { const MCOperandInfo &OpInfo = MCID.OpInfo[i]; if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) || (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) { MCOperand &Op = Inst.getOperand(i); if (Op.isImm()) { int MemOffset = Op.getImm(); MCOperand &DstReg = Inst.getOperand(0); MCOperand &BaseReg = Inst.getOperand(1); if (isInt<9>(MemOffset) && (MemOffset % 4 == 0) && getContext().getRegisterInfo()->getRegClass( Mips::GPRMM16RegClassID).contains(DstReg.getReg()) && (BaseReg.getReg() == Mips::GP || BaseReg.getReg() == Mips::GP_64)) { TOut.emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset, IDLoc, STI); return false; } } } } // for } // if load // TODO: Handle this with the AsmOperandClass.PredicateMethod. MCOperand Opnd; int Imm; switch (Inst.getOpcode()) { default: break; case Mips::ADDIUSP_MM: Opnd = Inst.getOperand(0); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (Imm < -1032 || Imm > 1028 || (Imm < 8 && Imm > -12) || Imm % 4 != 0) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::SLL16_MM: case Mips::SRL16_MM: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (Imm < 1 || Imm > 8) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::LI16_MM: Opnd = Inst.getOperand(1); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (Imm < -1 || Imm > 126) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::ADDIUR2_MM: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (!(Imm == 1 || Imm == -1 || ((Imm % 4 == 0) && Imm < 28 && Imm > 0))) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::ANDI16_MM: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (!(Imm == 128 || (Imm >= 1 && Imm <= 4) || Imm == 7 || Imm == 8 || Imm == 15 || Imm == 16 || Imm == 31 || Imm == 32 || Imm == 63 || Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535)) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::LBU16_MM: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (Imm < -1 || Imm > 14) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::SB16_MM: case Mips::SB16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (Imm < 0 || Imm > 15) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::LHU16_MM: case Mips::SH16_MM: case Mips::SH16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (Imm < 0 || Imm > 30 || (Imm % 2 != 0)) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::LW16_MM: case Mips::SW16_MM: case Mips::SW16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if (Imm < 0 || Imm > 60 || (Imm % 4 != 0)) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::ADDIUPC_MM: Opnd = Inst.getOperand(1); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); Imm = Opnd.getImm(); if ((Imm % 4 != 0) || !isInt<25>(Imm)) return Error(IDLoc, "immediate operand value out of range"); break; case Mips::LWP_MM: case Mips::SWP_MM: if (Inst.getOperand(0).getReg() == Mips::RA) return Error(IDLoc, "invalid operand for instruction"); break; case Mips::MOVEP_MM: case Mips::MOVEP_MMR6: { unsigned R0 = Inst.getOperand(0).getReg(); unsigned R1 = Inst.getOperand(1).getReg(); bool RegPair = ((R0 == Mips::A1 && R1 == Mips::A2) || (R0 == Mips::A1 && R1 == Mips::A3) || (R0 == Mips::A2 && R1 == Mips::A3) || (R0 == Mips::A0 && R1 == Mips::S5) || (R0 == Mips::A0 && R1 == Mips::S6) || (R0 == Mips::A0 && R1 == Mips::A1) || (R0 == Mips::A0 && R1 == Mips::A2) || (R0 == Mips::A0 && R1 == Mips::A3)); if (!RegPair) return Error(IDLoc, "invalid operand for instruction"); break; } } } bool FillDelaySlot = MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder(); if (FillDelaySlot) TOut.emitDirectiveSetNoReorder(); MacroExpanderResultTy ExpandResult = tryExpandInstruction(Inst, IDLoc, Out, STI); switch (ExpandResult) { case MER_NotAMacro: Out.EmitInstruction(Inst, *STI); break; case MER_Success: break; case MER_Fail: return true; } // We know we emitted an instruction on the MER_NotAMacro or MER_Success path. // If we're in microMIPS mode then we must also set EF_MIPS_MICROMIPS. if (inMicroMipsMode()) { TOut.setUsesMicroMips(); TOut.updateABIInfo(*this); } // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. if (FillDelaySlot) { TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst), IDLoc, STI); TOut.emitDirectiveSetReorder(); } if ((Inst.getOpcode() == Mips::JalOneReg || Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) && isPicAndNotNxxAbi()) { if (IsCpRestoreSet) { // We need a NOP between the JALR and the LW: // If .set reorder has been used, we've already emitted a NOP. // If .set noreorder has been used, we need to emit a NOP at this point. if (!AssemblerOptions.back()->isReorder()) TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst), IDLoc, STI); // Load the $gp from the stack. TOut.emitGPRestore(CpRestoreOffset, IDLoc, STI); } else Warning(IDLoc, "no .cprestore used in PIC mode"); } return false; } MipsAsmParser::MacroExpanderResultTy MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { switch (Inst.getOpcode()) { default: return MER_NotAMacro; case Mips::LoadImm32: return expandLoadImm(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LoadImm64: return expandLoadImm(Inst, false, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LoadAddrImm32: case Mips::LoadAddrImm64: assert(Inst.getOperand(0).isReg() && "expected register operand kind"); assert((Inst.getOperand(1).isImm() || Inst.getOperand(1).isExpr()) && "expected immediate operand kind"); return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister, Inst.getOperand(1), Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LoadAddrReg32: case Mips::LoadAddrReg64: assert(Inst.getOperand(0).isReg() && "expected register operand kind"); assert(Inst.getOperand(1).isReg() && "expected register operand kind"); assert((Inst.getOperand(2).isImm() || Inst.getOperand(2).isExpr()) && "expected immediate operand kind"); return expandLoadAddress(Inst.getOperand(0).getReg(), Inst.getOperand(1).getReg(), Inst.getOperand(2), Inst.getOpcode() == Mips::LoadAddrReg32, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::B_MM_Pseudo: case Mips::B_MMR6_Pseudo: return expandUncondBranchMMPseudo(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SWM_MM: case Mips::LWM_MM: return expandLoadStoreMultiple(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::JalOneReg: case Mips::JalTwoReg: return expandJalWithRegs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::BneImm: case Mips::BeqImm: case Mips::BEQLImmMacro: case Mips::BNELImmMacro: return expandBranchImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::BLT: case Mips::BLE: case Mips::BGE: case Mips::BGT: case Mips::BLTU: case Mips::BLEU: case Mips::BGEU: case Mips::BGTU: case Mips::BLTL: case Mips::BLEL: case Mips::BGEL: case Mips::BGTL: case Mips::BLTUL: case Mips::BLEUL: case Mips::BGEUL: case Mips::BGTUL: case Mips::BLTImmMacro: case Mips::BLEImmMacro: case Mips::BGEImmMacro: case Mips::BGTImmMacro: case Mips::BLTUImmMacro: case Mips::BLEUImmMacro: case Mips::BGEUImmMacro: case Mips::BGTUImmMacro: case Mips::BLTLImmMacro: case Mips::BLELImmMacro: case Mips::BGELImmMacro: case Mips::BGTLImmMacro: case Mips::BLTULImmMacro: case Mips::BLEULImmMacro: case Mips::BGEULImmMacro: case Mips::BGTULImmMacro: return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SDivMacro: case Mips::SDivIMacro: case Mips::SRemMacro: case Mips::SRemIMacro: return expandDivRem(Inst, IDLoc, Out, STI, false, true) ? MER_Fail : MER_Success; case Mips::DSDivMacro: case Mips::DSDivIMacro: case Mips::DSRemMacro: case Mips::DSRemIMacro: return expandDivRem(Inst, IDLoc, Out, STI, true, true) ? MER_Fail : MER_Success; case Mips::UDivMacro: case Mips::UDivIMacro: case Mips::URemMacro: case Mips::URemIMacro: return expandDivRem(Inst, IDLoc, Out, STI, false, false) ? MER_Fail : MER_Success; case Mips::DUDivMacro: case Mips::DUDivIMacro: case Mips::DURemMacro: case Mips::DURemIMacro: return expandDivRem(Inst, IDLoc, Out, STI, true, false) ? MER_Fail : MER_Success; case Mips::PseudoTRUNC_W_S: return expandTrunc(Inst, false, false, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::PseudoTRUNC_W_D32: return expandTrunc(Inst, true, false, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::PseudoTRUNC_W_D: return expandTrunc(Inst, true, true, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LoadImmSingleGPR: return expandLoadImmReal(Inst, true, true, false, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LoadImmSingleFGR: return expandLoadImmReal(Inst, true, false, false, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LoadImmDoubleGPR: return expandLoadImmReal(Inst, false, true, false, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LoadImmDoubleFGR: return expandLoadImmReal(Inst, false, false, true, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LoadImmDoubleFGR_32: return expandLoadImmReal(Inst, false, false, false, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::Ulh: return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::Ulhu: return expandUlh(Inst, false, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::Ush: return expandUsh(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::Ulw: case Mips::Usw: return expandUxw(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::NORImm: case Mips::NORImm64: return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SGE: case Mips::SGEU: return expandSge(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SGEImm: case Mips::SGEUImm: case Mips::SGEImm64: case Mips::SGEUImm64: return expandSgeImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SGTImm: case Mips::SGTUImm: case Mips::SGTImm64: case Mips::SGTUImm64: return expandSgtImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SLTImm64: if (isInt<16>(Inst.getOperand(2).getImm())) { Inst.setOpcode(Mips::SLTi64); return MER_NotAMacro; } return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SLTUImm64: if (isInt<16>(Inst.getOperand(2).getImm())) { Inst.setOpcode(Mips::SLTiu64); return MER_NotAMacro; } return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::ADDi: case Mips::ADDi_MM: case Mips::ADDiu: case Mips::ADDiu_MM: case Mips::SLTi: case Mips::SLTi_MM: case Mips::SLTiu: case Mips::SLTiu_MM: if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) { int64_t ImmValue = Inst.getOperand(2).getImm(); if (isInt<16>(ImmValue)) return MER_NotAMacro; return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } return MER_NotAMacro; case Mips::ANDi: case Mips::ANDi_MM: case Mips::ANDi64: case Mips::ORi: case Mips::ORi_MM: case Mips::ORi64: case Mips::XORi: case Mips::XORi_MM: case Mips::XORi64: if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) { int64_t ImmValue = Inst.getOperand(2).getImm(); if (isUInt<16>(ImmValue)) return MER_NotAMacro; return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } return MER_NotAMacro; case Mips::ROL: case Mips::ROR: return expandRotation(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::ROLImm: case Mips::RORImm: return expandRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::DROL: case Mips::DROR: return expandDRotation(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::DROLImm: case Mips::DRORImm: return expandDRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::ABSMacro: return expandAbs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::MULImmMacro: case Mips::DMULImmMacro: return expandMulImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::MULOMacro: case Mips::DMULOMacro: return expandMulO(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::MULOUMacro: case Mips::DMULOUMacro: return expandMulOU(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::DMULMacro: return expandDMULMacro(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::LDMacro: case Mips::SDMacro: return expandLoadStoreDMacro(Inst, IDLoc, Out, STI, Inst.getOpcode() == Mips::LDMacro) ? MER_Fail : MER_Success; case Mips::SDC1_M1: return expandStoreDM1Macro(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQMacro: return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::MFTC0: case Mips::MTTC0: case Mips::MFTGPR: case Mips::MTTGPR: case Mips::MFTLO: case Mips::MTTLO: case Mips::MFTHI: case Mips::MTTHI: case Mips::MFTACX: case Mips::MTTACX: case Mips::MFTDSP: case Mips::MTTDSP: case Mips::MFTC1: case Mips::MTTC1: case Mips::MFTHC1: case Mips::MTTHC1: case Mips::CFTC1: case Mips::CTTC1: return expandMXTRAlias(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } } bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); // Create a JALR instruction which is going to replace the pseudo-JAL. MCInst JalrInst; JalrInst.setLoc(IDLoc); const MCOperand FirstRegOp = Inst.getOperand(0); const unsigned Opcode = Inst.getOpcode(); if (Opcode == Mips::JalOneReg) { // jal $rs => jalr $rs if (IsCpRestoreSet && inMicroMipsMode()) { JalrInst.setOpcode(Mips::JALRS16_MM); JalrInst.addOperand(FirstRegOp); } else if (inMicroMipsMode()) { JalrInst.setOpcode(hasMips32r6() ? Mips::JALRC16_MMR6 : Mips::JALR16_MM); JalrInst.addOperand(FirstRegOp); } else { JalrInst.setOpcode(Mips::JALR); JalrInst.addOperand(MCOperand::createReg(Mips::RA)); JalrInst.addOperand(FirstRegOp); } } else if (Opcode == Mips::JalTwoReg) { // jal $rd, $rs => jalr $rd, $rs if (IsCpRestoreSet && inMicroMipsMode()) JalrInst.setOpcode(Mips::JALRS_MM); else JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); JalrInst.addOperand(FirstRegOp); const MCOperand SecondRegOp = Inst.getOperand(1); JalrInst.addOperand(SecondRegOp); } Out.EmitInstruction(JalrInst, *STI); // If .set reorder is active and branch instruction has a delay slot, // emit a NOP after it. const MCInstrDesc &MCID = getInstDesc(JalrInst.getOpcode()); if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) TOut.emitEmptyDelaySlot(hasShortDelaySlot(JalrInst), IDLoc, STI); return false; } /// Can the value be represented by a unsigned N-bit value and a shift left? template static bool isShiftedUIntAtAnyPosition(uint64_t x) { unsigned BitNum = findFirstSet(x); return (x == x >> BitNum << BitNum) && isUInt(x >> BitNum); } /// Load (or add) an immediate into a register. /// /// @param ImmValue The immediate to load. /// @param DstReg The register that will hold the immediate. /// @param SrcReg A register to add to the immediate or Mips::NoRegister /// for a simple initialization. /// @param Is32BitImm Is ImmValue 32-bit or 64-bit? /// @param IsAddress True if the immediate represents an address. False if it /// is an integer. /// @param IDLoc Location of the immediate in the source file. bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg, bool Is32BitImm, bool IsAddress, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); if (!Is32BitImm && !isGP64bit()) { Error(IDLoc, "instruction requires a 64-bit architecture"); return true; } if (Is32BitImm) { if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { // Sign extend up to 64-bit so that the predicates match the hardware // behaviour. In particular, isInt<16>(0xffff8000) and similar should be // true. ImmValue = SignExtend64<32>(ImmValue); } else { Error(IDLoc, "instruction requires a 32-bit immediate"); return true; } } unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg(); unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu; bool UseSrcReg = false; if (SrcReg != Mips::NoRegister) UseSrcReg = true; unsigned TmpReg = DstReg; if (UseSrcReg && getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) { // At this point we need AT to perform the expansions and we exit if it is // not available. unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; TmpReg = ATReg; } if (isInt<16>(ImmValue)) { if (!UseSrcReg) SrcReg = ZeroReg; // This doesn't quite follow the usual ABI expectations for N32 but matches // traditional assembler behaviour. N32 would normally use addiu for both // integers and addresses. if (IsAddress && !Is32BitImm) { TOut.emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI); return false; } TOut.emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI); return false; } if (isUInt<16>(ImmValue)) { unsigned TmpReg = DstReg; if (SrcReg == DstReg) { TmpReg = getATReg(IDLoc); if (!TmpReg) return true; } TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, STI); if (UseSrcReg) TOut.emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, STI); return false; } if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { warnIfNoMacro(IDLoc); uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; uint16_t Bits15To0 = ImmValue & 0xffff; if (!Is32BitImm && !isInt<32>(ImmValue)) { // Traditional behaviour seems to special case this particular value. It's // not clear why other masks are handled differently. if (ImmValue == 0xffffffff) { TOut.emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, STI); TOut.emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, STI); if (UseSrcReg) TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI); return false; } // Expand to an ORi instead of a LUi to avoid sign-extending into the // upper 32 bits. TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, STI); TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, STI); if (Bits15To0) TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI); if (UseSrcReg) TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI); return false; } TOut.emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, STI); if (Bits15To0) TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI); if (UseSrcReg) TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI); return false; } if (isShiftedUIntAtAnyPosition<16>(ImmValue)) { if (Is32BitImm) { Error(IDLoc, "instruction requires a 32-bit immediate"); return true; } // Traditionally, these immediates are shifted as little as possible and as // such we align the most significant bit to bit 15 of our temporary. unsigned FirstSet = findFirstSet((uint64_t)ImmValue); unsigned LastSet = findLastSet((uint64_t)ImmValue); unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet)); uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff; TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, STI); TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, STI); if (UseSrcReg) TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI); return false; } warnIfNoMacro(IDLoc); // The remaining case is packed with a sequence of dsll and ori with zeros // being omitted and any neighbouring dsll's being coalesced. // The highest 32-bit's are equivalent to a 32-bit immediate load. // Load bits 32-63 of ImmValue into bits 0-31 of the temporary register. if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false, IDLoc, Out, STI)) return false; // Shift and accumulate into the register. If a 16-bit chunk is zero, then // skip it and defer the shift to the next chunk. unsigned ShiftCarriedForwards = 16; for (int BitNum = 16; BitNum >= 0; BitNum -= 16) { uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff; if (ImmChunk != 0) { TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI); TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, STI); ShiftCarriedForwards = 0; } ShiftCarriedForwards += 16; } ShiftCarriedForwards -= 16; // Finish any remaining shifts left by trailing zeros. if (ShiftCarriedForwards) TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI); if (UseSrcReg) TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI); return false; } bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { const MCOperand &ImmOp = Inst.getOperand(1); assert(ImmOp.isImm() && "expected immediate operand kind"); const MCOperand &DstRegOp = Inst.getOperand(0); assert(DstRegOp.isReg() && "expected register operand kind"); if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister, Is32BitImm, false, IDLoc, Out, STI)) return true; return false; } bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg, const MCOperand &Offset, bool Is32BitAddress, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { // la can't produce a usable address when addresses are 64-bit. if (Is32BitAddress && ABI.ArePtrs64bit()) { // FIXME: Demote this to a warning and continue as if we had 'dla' instead. // We currently can't do this because we depend on the equality // operator and N64 can end up with a GPR32/GPR64 mismatch. Error(IDLoc, "la used to load 64-bit address"); // Continue as if we had 'dla' instead. Is32BitAddress = false; return true; } // dla requires 64-bit addresses. if (!Is32BitAddress && !hasMips3()) { Error(IDLoc, "instruction requires a 64-bit architecture"); return true; } if (!Offset.isImm()) return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg, Is32BitAddress, IDLoc, Out, STI); if (!ABI.ArePtrs64bit()) { // Continue as if we had 'la' whether we had 'la' or 'dla'. Is32BitAddress = true; } return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true, IDLoc, Out, STI); } bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg, unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { // FIXME: These expansions do not respect -mxgot. MipsTargetStreamer &TOut = getTargetStreamer(); bool UseSrcReg = SrcReg != Mips::NoRegister; warnIfNoMacro(IDLoc); if (inPicMode() && ABI.IsO32()) { MCValue Res; if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) { Error(IDLoc, "expected relocatable expression"); return true; } if (Res.getSymB() != nullptr) { Error(IDLoc, "expected relocatable expression with only one symbol"); return true; } // The case where the result register is $25 is somewhat special. If the // symbol in the final relocation is external and not modified with a // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16. if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && Res.getConstant() == 0 && !(Res.getSymA()->getSymbol().isInSection() || Res.getSymA()->getSymbol().isTemporary() || (Res.getSymA()->getSymbol().isELF() && cast(Res.getSymA()->getSymbol()).getBinding() == ELF::STB_LOCAL))) { const MCExpr *CallExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); TOut.emitRRX(Mips::LW, DstReg, GPReg, MCOperand::createExpr(CallExpr), IDLoc, STI); return false; } // The remaining cases are: // External GOT: lw $tmp, %got(symbol+offset)($gp) // >addiu $tmp, $tmp, %lo(offset) // >addiu $rd, $tmp, $rs // Local GOT: lw $tmp, %got(symbol+offset)($gp) // addiu $tmp, $tmp, %lo(symbol+offset)($gp) // >addiu $rd, $tmp, $rs // The addiu's marked with a '>' may be omitted if they are redundant. If // this happens then the last instruction must use $rd as the result // register. const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext()); const MCExpr *LoExpr = nullptr; if (Res.getSymA()->getSymbol().isInSection() || Res.getSymA()->getSymbol().isTemporary()) LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext()); else if (Res.getConstant() != 0) { // External symbols fully resolve the symbol with just the %got(symbol) // but we must still account for any offset to the symbol for expressions // like symbol+8. LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); } unsigned TmpReg = DstReg; if (UseSrcReg && getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) { // If $rs is the same as $rd, we need to use AT. // If it is not available we exit. unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; TmpReg = ATReg; } TOut.emitRRX(Mips::LW, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, STI); if (LoExpr) TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), IDLoc, STI); if (UseSrcReg) TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); return false; } if (inPicMode() && ABI.ArePtrs64bit()) { MCValue Res; if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) { Error(IDLoc, "expected relocatable expression"); return true; } if (Res.getSymB() != nullptr) { Error(IDLoc, "expected relocatable expression with only one symbol"); return true; } // The case where the result register is $25 is somewhat special. If the // symbol in the final relocation is external and not modified with a // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP. if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg && Res.getConstant() == 0 && !(Res.getSymA()->getSymbol().isInSection() || Res.getSymA()->getSymbol().isTemporary() || (Res.getSymA()->getSymbol().isELF() && cast(Res.getSymA()->getSymbol()).getBinding() == ELF::STB_LOCAL))) { const MCExpr *CallExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext()); TOut.emitRRX(Mips::LD, DstReg, GPReg, MCOperand::createExpr(CallExpr), IDLoc, STI); return false; } // The remaining cases are: // Small offset: ld $tmp, %got_disp(symbol)($gp) // >daddiu $tmp, $tmp, offset // >daddu $rd, $tmp, $rs // The daddiu's marked with a '>' may be omitted if they are redundant. If // this happens then the last instruction must use $rd as the result // register. const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, Res.getSymA(), getContext()); const MCExpr *LoExpr = nullptr; if (Res.getConstant() != 0) { // Symbols fully resolve with just the %got_disp(symbol) but we // must still account for any offset to the symbol for // expressions like symbol+8. LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); // FIXME: Offsets greater than 16 bits are not yet implemented. // FIXME: The correct range is a 32-bit sign-extended number. if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) { Error(IDLoc, "macro instruction uses large offset, which is not " "currently supported"); return true; } } unsigned TmpReg = DstReg; if (UseSrcReg && getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) { // If $rs is the same as $rd, we need to use AT. // If it is not available we exit. unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; TmpReg = ATReg; } TOut.emitRRX(Mips::LD, TmpReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, STI); if (LoExpr) TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), IDLoc, STI); if (UseSrcReg) TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); return false; } const MipsMCExpr *HiExpr = MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext()); const MipsMCExpr *LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext()); // This is the 64-bit symbol address expansion. if (ABI.ArePtrs64bit() && isGP64bit()) { // We need AT for the 64-bit expansion in the cases where the optional // source register is the destination register and for the superscalar // scheduled form. // // If it is not available we exit if the destination is the same as the // source register. const MipsMCExpr *HighestExpr = MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext()); const MipsMCExpr *HigherExpr = MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext()); bool RdRegIsRsReg = getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg); if (canUseATReg() && UseSrcReg && RdRegIsRsReg) { unsigned ATReg = getATReg(IDLoc); // If $rs is the same as $rd: // (d)la $rd, sym($rd) => lui $at, %highest(sym) // daddiu $at, $at, %higher(sym) // dsll $at, $at, 16 // daddiu $at, $at, %hi(sym) // dsll $at, $at, 16 // daddiu $at, $at, %lo(sym) // daddu $rd, $at, $rd TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc, STI); TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HigherExpr), IDLoc, STI); TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI); TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI); TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI); TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, STI); TOut.emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, STI); return false; } else if (canUseATReg() && !RdRegIsRsReg) { unsigned ATReg = getATReg(IDLoc); // If the $rs is different from $rd or if $rs isn't specified and we // have $at available: // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym) // lui $at, %hi(sym) // daddiu $rd, $rd, %higher(sym) // daddiu $at, $at, %lo(sym) // dsll32 $rd, $rd, 0 // daddu $rd, $rd, $at // (daddu $rd, $rd, $rs) // // Which is preferred for superscalar issue. TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc, STI); TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI); TOut.emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(HigherExpr), IDLoc, STI); TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, STI); TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI); TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI); if (UseSrcReg) TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI); return false; } else if (!canUseATReg() && !RdRegIsRsReg) { // Otherwise, synthesize the address in the destination register // serially: // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym) // daddiu $rd, $rd, %higher(sym) // dsll $rd, $rd, 16 // daddiu $rd, $rd, %hi(sym) // dsll $rd, $rd, 16 // daddiu $rd, $rd, %lo(sym) TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc, STI); TOut.emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(HigherExpr), IDLoc, STI); TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI); TOut.emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(HiExpr), IDLoc, STI); TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI); TOut.emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(LoExpr), IDLoc, STI); if (UseSrcReg) TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI); return false; } else { // We have a case where SrcReg == DstReg and we don't have $at // available. We can't expand this case, so error out appropriately. assert(SrcReg == DstReg && !canUseATReg() && "Could have expanded dla but didn't?"); reportParseError(IDLoc, "pseudo-instruction requires $at, which is not available"); return true; } } // And now, the 32-bit symbol address expansion: // If $rs is the same as $rd: // (d)la $rd, sym($rd) => lui $at, %hi(sym) // ori $at, $at, %lo(sym) // addu $rd, $at, $rd // Otherwise, if the $rs is different from $rd or if $rs isn't specified: // (d)la $rd, sym/sym($rs) => lui $rd, %hi(sym) // ori $rd, $rd, %lo(sym) // (addu $rd, $rd, $rs) unsigned TmpReg = DstReg; if (UseSrcReg && getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) { // If $rs is the same as $rd, we need to use AT. // If it is not available we exit. unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; TmpReg = ATReg; } TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, STI); TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), IDLoc, STI); if (UseSrcReg) TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI); else assert( getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, TmpReg)); return false; } // Each double-precision register DO-D15 overlaps with two of the single // precision registers F0-F31. As an example, all of the following hold true: // D0 + 1 == F1, F1 + 1 == D1, F1 + 1 == F2, depending on the context. static unsigned nextReg(unsigned Reg) { if (MipsMCRegisterClasses[Mips::FGR32RegClassID].contains(Reg)) return Reg == (unsigned)Mips::F31 ? (unsigned)Mips::F0 : Reg + 1; switch (Reg) { default: llvm_unreachable("Unknown register in assembly macro expansion!"); case Mips::ZERO: return Mips::AT; case Mips::AT: return Mips::V0; case Mips::V0: return Mips::V1; case Mips::V1: return Mips::A0; case Mips::A0: return Mips::A1; case Mips::A1: return Mips::A2; case Mips::A2: return Mips::A3; case Mips::A3: return Mips::T0; case Mips::T0: return Mips::T1; case Mips::T1: return Mips::T2; case Mips::T2: return Mips::T3; case Mips::T3: return Mips::T4; case Mips::T4: return Mips::T5; case Mips::T5: return Mips::T6; case Mips::T6: return Mips::T7; case Mips::T7: return Mips::S0; case Mips::S0: return Mips::S1; case Mips::S1: return Mips::S2; case Mips::S2: return Mips::S3; case Mips::S3: return Mips::S4; case Mips::S4: return Mips::S5; case Mips::S5: return Mips::S6; case Mips::S6: return Mips::S7; case Mips::S7: return Mips::T8; case Mips::T8: return Mips::T9; case Mips::T9: return Mips::K0; case Mips::K0: return Mips::K1; case Mips::K1: return Mips::GP; case Mips::GP: return Mips::SP; case Mips::SP: return Mips::FP; case Mips::FP: return Mips::RA; case Mips::RA: return Mips::ZERO; case Mips::D0: return Mips::F1; case Mips::D1: return Mips::F3; case Mips::D2: return Mips::F5; case Mips::D3: return Mips::F7; case Mips::D4: return Mips::F9; case Mips::D5: return Mips::F11; case Mips::D6: return Mips::F13; case Mips::D7: return Mips::F15; case Mips::D8: return Mips::F17; case Mips::D9: return Mips::F19; case Mips::D10: return Mips::F21; case Mips::D11: return Mips::F23; case Mips::D12: return Mips::F25; case Mips::D13: return Mips::F27; case Mips::D14: return Mips::F29; case Mips::D15: return Mips::F31; } } // FIXME: This method is too general. In principle we should compute the number // of instructions required to synthesize the immediate inline compared to // synthesizing the address inline and relying on non .text sections. // For static O32 and N32 this may yield a small benefit, for static N64 this is // likely to yield a much larger benefit as we have to synthesize a 64bit // address to load a 64 bit value. bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, MCSymbol *Sym) { unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; if(IsPicEnabled) { const MCExpr *GotSym = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT, GotSym, getContext()); if(isABI_O32() || isABI_N32()) { TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, STI); } else { //isABI_N64() TOut.emitRRX(Mips::LD, ATReg, GPReg, MCOperand::createExpr(GotExpr), IDLoc, STI); } } else { //!IsPicEnabled const MCExpr *HiSym = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); const MipsMCExpr *HiExpr = MipsMCExpr::create(MipsMCExpr::MEK_HI, HiSym, getContext()); // FIXME: This is technically correct but gives a different result to gas, // but gas is incomplete there (it has a fixme noting it doesn't work with // 64-bit addresses). // FIXME: With -msym32 option, the address expansion for N64 should probably // use the O32 / N32 case. It's safe to use the 64 address expansion as the // symbol's value is considered sign extended. if(isABI_O32() || isABI_N32()) { TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI); } else { //isABI_N64() const MCExpr *HighestSym = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); const MipsMCExpr *HighestExpr = MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, HighestSym, getContext()); const MCExpr *HigherSym = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); const MipsMCExpr *HigherExpr = MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, HigherSym, getContext()); TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc, STI); TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HigherExpr), IDLoc, STI); TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI); TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI); TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI); } } return false; } bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, bool Is64FPU, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 2 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() && "Invalid instruction operand."); unsigned FirstReg = Inst.getOperand(0).getReg(); uint64_t ImmOp64 = Inst.getOperand(1).getImm(); uint32_t HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32; // If ImmOp64 is AsmToken::Integer type (all bits set to zero in the // exponent field), convert it to double (e.g. 1 to 1.0) if ((HiImmOp64 & 0x7ff00000) == 0) { APFloat RealVal(APFloat::IEEEdouble(), ImmOp64); ImmOp64 = RealVal.bitcastToAPInt().getZExtValue(); } uint32_t LoImmOp64 = ImmOp64 & 0xffffffff; HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32; if (IsSingle) { // Conversion of a double in an uint64_t to a float in a uint32_t, // retaining the bit pattern of a float. uint32_t ImmOp32; double doubleImm = BitsToDouble(ImmOp64); float tmp_float = static_cast(doubleImm); ImmOp32 = FloatToBits(tmp_float); if (IsGPR) { if (loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, true, IDLoc, Out, STI)) return true; return false; } else { unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; if (LoImmOp64 == 0) { if (loadImmediate(ImmOp32, ATReg, Mips::NoRegister, true, true, IDLoc, Out, STI)) return true; TOut.emitRR(Mips::MTC1, FirstReg, ATReg, IDLoc, STI); return false; } MCSection *CS = getStreamer().getCurrentSectionOnly(); // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections // where appropriate. MCSection *ReadOnlySection = getContext().getELFSection( ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); MCSymbol *Sym = getContext().createTempSymbol(); const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); const MipsMCExpr *LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); getStreamer().SwitchSection(ReadOnlySection); getStreamer().EmitLabel(Sym, IDLoc); getStreamer().EmitIntValue(ImmOp32, 4); getStreamer().SwitchSection(CS); if(emitPartialAddress(TOut, IDLoc, Sym)) return true; TOut.emitRRX(Mips::LWC1, FirstReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, STI); } return false; } // if(!IsSingle) unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; if (IsGPR) { if (LoImmOp64 == 0) { if(isABI_N32() || isABI_N64()) { if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, false, true, IDLoc, Out, STI)) return true; return false; } else { if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, true, true, IDLoc, Out, STI)) return true; if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, true, IDLoc, Out, STI)) return true; return false; } } MCSection *CS = getStreamer().getCurrentSectionOnly(); MCSection *ReadOnlySection = getContext().getELFSection( ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); MCSymbol *Sym = getContext().createTempSymbol(); const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); const MipsMCExpr *LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); getStreamer().SwitchSection(ReadOnlySection); getStreamer().EmitLabel(Sym, IDLoc); getStreamer().EmitIntValue(HiImmOp64, 4); getStreamer().EmitIntValue(LoImmOp64, 4); getStreamer().SwitchSection(CS); if(emitPartialAddress(TOut, IDLoc, Sym)) return true; if(isABI_N64()) TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, STI); else TOut.emitRRX(Mips::ADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, STI); if(isABI_N32() || isABI_N64()) TOut.emitRRI(Mips::LD, FirstReg, ATReg, 0, IDLoc, STI); else { TOut.emitRRI(Mips::LW, FirstReg, ATReg, 0, IDLoc, STI); TOut.emitRRI(Mips::LW, nextReg(FirstReg), ATReg, 4, IDLoc, STI); } return false; } else { // if(!IsGPR && !IsSingle) if ((LoImmOp64 == 0) && !((HiImmOp64 & 0xffff0000) && (HiImmOp64 & 0x0000ffff))) { // FIXME: In the case where the constant is zero, we can load the // register directly from the zero register. if (loadImmediate(HiImmOp64, ATReg, Mips::NoRegister, true, true, IDLoc, Out, STI)) return true; if (isABI_N32() || isABI_N64()) TOut.emitRR(Mips::DMTC1, FirstReg, ATReg, IDLoc, STI); else if (hasMips32r2()) { TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, ATReg, IDLoc, STI); } else { TOut.emitRR(Mips::MTC1, nextReg(FirstReg), ATReg, IDLoc, STI); TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI); } return false; } MCSection *CS = getStreamer().getCurrentSectionOnly(); // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections // where appropriate. MCSection *ReadOnlySection = getContext().getELFSection( ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); MCSymbol *Sym = getContext().createTempSymbol(); const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); const MipsMCExpr *LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext()); getStreamer().SwitchSection(ReadOnlySection); getStreamer().EmitLabel(Sym, IDLoc); getStreamer().EmitIntValue(HiImmOp64, 4); getStreamer().EmitIntValue(LoImmOp64, 4); getStreamer().SwitchSection(CS); if(emitPartialAddress(TOut, IDLoc, Sym)) return true; TOut.emitRRX(Is64FPU ? Mips::LDC164 : Mips::LDC1, FirstReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, STI); } return false; } bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(getInstDesc(Inst.getOpcode()).getNumOperands() == 1 && "unexpected number of operands"); MCOperand Offset = Inst.getOperand(0); if (Offset.isExpr()) { Inst.clear(); Inst.setOpcode(Mips::BEQ_MM); Inst.addOperand(MCOperand::createReg(Mips::ZERO)); Inst.addOperand(MCOperand::createReg(Mips::ZERO)); Inst.addOperand(MCOperand::createExpr(Offset.getExpr())); } else { assert(Offset.isImm() && "expected immediate operand kind"); if (isInt<11>(Offset.getImm())) { // If offset fits into 11 bits then this instruction becomes microMIPS // 16-bit unconditional branch instruction. if (inMicroMipsMode()) Inst.setOpcode(hasMips32r6() ? Mips::BC16_MMR6 : Mips::B16_MM); } else { if (!isInt<17>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << 1)) return Error(IDLoc, "branch to misaligned address"); Inst.clear(); Inst.setOpcode(Mips::BEQ_MM); Inst.addOperand(MCOperand::createReg(Mips::ZERO)); Inst.addOperand(MCOperand::createReg(Mips::ZERO)); Inst.addOperand(MCOperand::createImm(Offset.getImm())); } } Out.EmitInstruction(Inst, *STI); // If .set reorder is active and branch instruction has a delay slot, // emit a NOP after it. const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) TOut.emitEmptyDelaySlot(true, IDLoc, STI); return false; } bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); const MCOperand &DstRegOp = Inst.getOperand(0); assert(DstRegOp.isReg() && "expected register operand kind"); const MCOperand &ImmOp = Inst.getOperand(1); assert(ImmOp.isImm() && "expected immediate operand kind"); const MCOperand &MemOffsetOp = Inst.getOperand(2); assert((MemOffsetOp.isImm() || MemOffsetOp.isExpr()) && "expected immediate or expression operand"); bool IsLikely = false; unsigned OpCode = 0; switch(Inst.getOpcode()) { case Mips::BneImm: OpCode = Mips::BNE; break; case Mips::BeqImm: OpCode = Mips::BEQ; break; case Mips::BEQLImmMacro: OpCode = Mips::BEQL; IsLikely = true; break; case Mips::BNELImmMacro: OpCode = Mips::BNEL; IsLikely = true; break; default: llvm_unreachable("Unknown immediate branch pseudo-instruction."); break; } int64_t ImmValue = ImmOp.getImm(); if (ImmValue == 0) { if (IsLikely) { TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI); TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); } else TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc, STI); } else { warnIfNoMacro(IDLoc); unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true, IDLoc, Out, STI)) return true; if (IsLikely) { TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI); TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI); } else TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI); } return false; } void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI, bool IsLoad) { const MCOperand &DstRegOp = Inst.getOperand(0); assert(DstRegOp.isReg() && "expected register operand kind"); const MCOperand &BaseRegOp = Inst.getOperand(1); assert(BaseRegOp.isReg() && "expected register operand kind"); const MCOperand &OffsetOp = Inst.getOperand(2); MipsTargetStreamer &TOut = getTargetStreamer(); unsigned DstReg = DstRegOp.getReg(); unsigned BaseReg = BaseRegOp.getReg(); unsigned TmpReg = DstReg; const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode()); int16_t DstRegClass = Desc.OpInfo[0].RegClass; unsigned DstRegClassID = getContext().getRegisterInfo()->getRegClass(DstRegClass).getID(); bool IsGPR = (DstRegClassID == Mips::GPR32RegClassID) || (DstRegClassID == Mips::GPR64RegClassID); if (!IsLoad || !IsGPR || (BaseReg == DstReg)) { // At this point we need AT to perform the expansions // and we exit if it is not available. TmpReg = getATReg(IDLoc); if (!TmpReg) return; } if (OffsetOp.isImm()) { int64_t LoOffset = OffsetOp.getImm() & 0xffff; int64_t HiOffset = OffsetOp.getImm() & ~0xffff; // If msb of LoOffset is 1(negative number) we must increment // HiOffset to account for the sign-extension of the low part. if (LoOffset & 0x8000) HiOffset += 0x10000; bool IsLargeOffset = HiOffset != 0; if (IsLargeOffset) { bool Is32BitImm = (HiOffset >> 32) == 0; if (loadImmediate(HiOffset, TmpReg, Mips::NoRegister, Is32BitImm, true, IDLoc, Out, STI)) return; } if (BaseReg != Mips::ZERO && BaseReg != Mips::ZERO_64) TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI); TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, LoOffset, IDLoc, STI); + return; + } + + assert(OffsetOp.isExpr() && "expected expression operand kind"); + if (inPicMode()) { + // FIXME: + // a) Fix lw/sw $reg, symbol($reg) instruction expanding. + // b) If expression includes offset (sym + number), do not + // encode the offset into a relocation. Take it in account + // in the last load/store instruction. + // c) Check that immediates of R_MIPS_GOT16/R_MIPS_LO16 relocations + // do not exceed 16-bit. + // d) Use R_MIPS_GOT_PAGE/R_MIPS_GOT_OFST relocations instead + // of R_MIPS_GOT_DISP in appropriate cases to reduce number + // of GOT entries. + expandLoadAddress(TmpReg, Mips::NoRegister, OffsetOp, !ABI.ArePtrs64bit(), + IDLoc, Out, STI); + TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, 0, IDLoc, STI); } else { - assert(OffsetOp.isExpr() && "expected expression operand kind"); const MCExpr *ExprOffset = OffsetOp.getExpr(); MCOperand LoOperand = MCOperand::createExpr( MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext())); MCOperand HiOperand = MCOperand::createExpr( MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext())); if (IsLoad) TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, LoOperand, TmpReg, IDLoc, STI); else TOut.emitStoreWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, LoOperand, TmpReg, IDLoc, STI); } } bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { unsigned OpNum = Inst.getNumOperands(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM32_MM : Mips::LWM32_MM; assert(Inst.getOperand(OpNum - 1).isImm() && Inst.getOperand(OpNum - 2).isReg() && Inst.getOperand(OpNum - 3).isReg() && "Invalid instruction operand."); if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 && Inst.getOperand(OpNum - 1).getImm() >= 0 && (Inst.getOperand(OpNum - 2).getReg() == Mips::SP || Inst.getOperand(OpNum - 2).getReg() == Mips::SP_64) && (Inst.getOperand(OpNum - 3).getReg() == Mips::RA || Inst.getOperand(OpNum - 3).getReg() == Mips::RA_64)) { // It can be implemented as SWM16 or LWM16 instruction. if (inMicroMipsMode() && hasMips32r6()) NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MMR6 : Mips::LWM16_MMR6; else NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM; } Inst.setOpcode(NewOpcode); Out.EmitInstruction(Inst, *STI); return false; } bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); bool EmittedNoMacroWarning = false; unsigned PseudoOpcode = Inst.getOpcode(); unsigned SrcReg = Inst.getOperand(0).getReg(); const MCOperand &TrgOp = Inst.getOperand(1); const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr(); unsigned ZeroSrcOpcode, ZeroTrgOpcode; bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality; unsigned TrgReg; if (TrgOp.isReg()) TrgReg = TrgOp.getReg(); else if (TrgOp.isImm()) { warnIfNoMacro(IDLoc); EmittedNoMacroWarning = true; TrgReg = getATReg(IDLoc); if (!TrgReg) return true; switch(PseudoOpcode) { default: llvm_unreachable("unknown opcode for branch pseudo-instruction"); case Mips::BLTImmMacro: PseudoOpcode = Mips::BLT; break; case Mips::BLEImmMacro: PseudoOpcode = Mips::BLE; break; case Mips::BGEImmMacro: PseudoOpcode = Mips::BGE; break; case Mips::BGTImmMacro: PseudoOpcode = Mips::BGT; break; case Mips::BLTUImmMacro: PseudoOpcode = Mips::BLTU; break; case Mips::BLEUImmMacro: PseudoOpcode = Mips::BLEU; break; case Mips::BGEUImmMacro: PseudoOpcode = Mips::BGEU; break; case Mips::BGTUImmMacro: PseudoOpcode = Mips::BGTU; break; case Mips::BLTLImmMacro: PseudoOpcode = Mips::BLTL; break; case Mips::BLELImmMacro: PseudoOpcode = Mips::BLEL; break; case Mips::BGELImmMacro: PseudoOpcode = Mips::BGEL; break; case Mips::BGTLImmMacro: PseudoOpcode = Mips::BGTL; break; case Mips::BLTULImmMacro: PseudoOpcode = Mips::BLTUL; break; case Mips::BLEULImmMacro: PseudoOpcode = Mips::BLEUL; break; case Mips::BGEULImmMacro: PseudoOpcode = Mips::BGEUL; break; case Mips::BGTULImmMacro: PseudoOpcode = Mips::BGTUL; break; } if (loadImmediate(TrgOp.getImm(), TrgReg, Mips::NoRegister, !isGP64bit(), false, IDLoc, Out, STI)) return true; } switch (PseudoOpcode) { case Mips::BLT: case Mips::BLTU: case Mips::BLTL: case Mips::BLTUL: AcceptsEquality = false; ReverseOrderSLT = false; IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL)); IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL)); ZeroSrcOpcode = Mips::BGTZ; ZeroTrgOpcode = Mips::BLTZ; break; case Mips::BLE: case Mips::BLEU: case Mips::BLEL: case Mips::BLEUL: AcceptsEquality = true; ReverseOrderSLT = true; IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL)); IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL)); ZeroSrcOpcode = Mips::BGEZ; ZeroTrgOpcode = Mips::BLEZ; break; case Mips::BGE: case Mips::BGEU: case Mips::BGEL: case Mips::BGEUL: AcceptsEquality = true; ReverseOrderSLT = false; IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL)); IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL)); ZeroSrcOpcode = Mips::BLEZ; ZeroTrgOpcode = Mips::BGEZ; break; case Mips::BGT: case Mips::BGTU: case Mips::BGTL: case Mips::BGTUL: AcceptsEquality = false; ReverseOrderSLT = true; IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL)); IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL)); ZeroSrcOpcode = Mips::BLTZ; ZeroTrgOpcode = Mips::BGTZ; break; default: llvm_unreachable("unknown opcode for branch pseudo-instruction"); } bool IsTrgRegZero = (TrgReg == Mips::ZERO); bool IsSrcRegZero = (SrcReg == Mips::ZERO); if (IsSrcRegZero && IsTrgRegZero) { // FIXME: All of these Opcode-specific if's are needed for compatibility // with GAS' behaviour. However, they may not generate the most efficient // code in some circumstances. if (PseudoOpcode == Mips::BLT) { TOut.emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); return false; } if (PseudoOpcode == Mips::BLE) { TOut.emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); Warning(IDLoc, "branch is always taken"); return false; } if (PseudoOpcode == Mips::BGE) { TOut.emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); Warning(IDLoc, "branch is always taken"); return false; } if (PseudoOpcode == Mips::BGT) { TOut.emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); return false; } if (PseudoOpcode == Mips::BGTU) { TOut.emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); return false; } if (AcceptsEquality) { // If both registers are $0 and the pseudo-branch accepts equality, it // will always be taken, so we emit an unconditional branch. TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); Warning(IDLoc, "branch is always taken"); return false; } // If both registers are $0 and the pseudo-branch does not accept // equality, it will never be taken, so we don't have to emit anything. return false; } if (IsSrcRegZero || IsTrgRegZero) { if ((IsSrcRegZero && PseudoOpcode == Mips::BGTU) || (IsTrgRegZero && PseudoOpcode == Mips::BLTU)) { // If the $rs is $0 and the pseudo-branch is BGTU (0 > x) or // if the $rt is $0 and the pseudo-branch is BLTU (x < 0), // the pseudo-branch will never be taken, so we don't emit anything. // This only applies to unsigned pseudo-branches. return false; } if ((IsSrcRegZero && PseudoOpcode == Mips::BLEU) || (IsTrgRegZero && PseudoOpcode == Mips::BGEU)) { // If the $rs is $0 and the pseudo-branch is BLEU (0 <= x) or // if the $rt is $0 and the pseudo-branch is BGEU (x >= 0), // the pseudo-branch will always be taken, so we emit an unconditional // branch. // This only applies to unsigned pseudo-branches. TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); Warning(IDLoc, "branch is always taken"); return false; } if (IsUnsigned) { // If the $rs is $0 and the pseudo-branch is BLTU (0 < x) or // if the $rt is $0 and the pseudo-branch is BGTU (x > 0), // the pseudo-branch will be taken only when the non-zero register is // different from 0, so we emit a BNEZ. // // If the $rs is $0 and the pseudo-branch is BGEU (0 >= x) or // if the $rt is $0 and the pseudo-branch is BLEU (x <= 0), // the pseudo-branch will be taken only when the non-zero register is // equal to 0, so we emit a BEQZ. // // Because only BLEU and BGEU branch on equality, we can use the // AcceptsEquality variable to decide when to emit the BEQZ. TOut.emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE, IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); return false; } // If we have a signed pseudo-branch and one of the registers is $0, // we can use an appropriate compare-to-zero branch. We select which one // to use in the switch statement above. TOut.emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode, IsSrcRegZero ? TrgReg : SrcReg, MCOperand::createExpr(OffsetExpr), IDLoc, STI); return false; } // If neither the SrcReg nor the TrgReg are $0, we need AT to perform the // expansions. If it is not available, we return. unsigned ATRegNum = getATReg(IDLoc); if (!ATRegNum) return true; if (!EmittedNoMacroWarning) warnIfNoMacro(IDLoc); // SLT fits well with 2 of our 4 pseudo-branches: // BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and // BGT, where $rs > $rt, translates into "slt $at, $rt, $rs". // If the result of the SLT is 1, we branch, and if it's 0, we don't. // This is accomplished by using a BNEZ with the result of the SLT. // // The other 2 pseudo-branches are opposites of the above 2 (BGE with BLT // and BLE with BGT), so we change the BNEZ into a BEQZ. // Because only BGE and BLE branch on equality, we can use the // AcceptsEquality variable to decide when to emit the BEQZ. // Note that the order of the SLT arguments doesn't change between // opposites. // // The same applies to the unsigned variants, except that SLTu is used // instead of SLT. TOut.emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum, ReverseOrderSLT ? TrgReg : SrcReg, ReverseOrderSLT ? SrcReg : TrgReg, IDLoc, STI); TOut.emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL) : (AcceptsEquality ? Mips::BEQ : Mips::BNE), ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, STI); return false; } // Expand a integer division macro. // // Notably we don't have to emit a warning when encountering $rt as the $zero // register, or 0 as an immediate. processInstruction() has already done that. // // The destination register can only be $zero when expanding (S)DivIMacro or // D(S)DivMacro. bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI, const bool IsMips64, const bool Signed) { MipsTargetStreamer &TOut = getTargetStreamer(); warnIfNoMacro(IDLoc); const MCOperand &RdRegOp = Inst.getOperand(0); assert(RdRegOp.isReg() && "expected register operand kind"); unsigned RdReg = RdRegOp.getReg(); const MCOperand &RsRegOp = Inst.getOperand(1); assert(RsRegOp.isReg() && "expected register operand kind"); unsigned RsReg = RsRegOp.getReg(); unsigned RtReg; int64_t ImmValue; const MCOperand &RtOp = Inst.getOperand(2); assert((RtOp.isReg() || RtOp.isImm()) && "expected register or immediate operand kind"); if (RtOp.isReg()) RtReg = RtOp.getReg(); else ImmValue = RtOp.getImm(); unsigned DivOp; unsigned ZeroReg; unsigned SubOp; if (IsMips64) { DivOp = Signed ? Mips::DSDIV : Mips::DUDIV; ZeroReg = Mips::ZERO_64; SubOp = Mips::DSUB; } else { DivOp = Signed ? Mips::SDIV : Mips::UDIV; ZeroReg = Mips::ZERO; SubOp = Mips::SUB; } bool UseTraps = useTraps(); unsigned Opcode = Inst.getOpcode(); bool isDiv = Opcode == Mips::SDivMacro || Opcode == Mips::SDivIMacro || Opcode == Mips::UDivMacro || Opcode == Mips::UDivIMacro || Opcode == Mips::DSDivMacro || Opcode == Mips::DSDivIMacro || Opcode == Mips::DUDivMacro || Opcode == Mips::DUDivIMacro; bool isRem = Opcode == Mips::SRemMacro || Opcode == Mips::SRemIMacro || Opcode == Mips::URemMacro || Opcode == Mips::URemIMacro || Opcode == Mips::DSRemMacro || Opcode == Mips::DSRemIMacro || Opcode == Mips::DURemMacro || Opcode == Mips::DURemIMacro; if (RtOp.isImm()) { unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; if (ImmValue == 0) { if (UseTraps) TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI); else TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI); return false; } if (isRem && (ImmValue == 1 || (Signed && (ImmValue == -1)))) { TOut.emitRRR(Mips::OR, RdReg, ZeroReg, ZeroReg, IDLoc, STI); return false; } else if (isDiv && ImmValue == 1) { TOut.emitRRR(Mips::OR, RdReg, RsReg, Mips::ZERO, IDLoc, STI); return false; } else if (isDiv && Signed && ImmValue == -1) { TOut.emitRRR(SubOp, RdReg, ZeroReg, RsReg, IDLoc, STI); return false; } else { if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue), false, Inst.getLoc(), Out, STI)) return true; TOut.emitRR(DivOp, RsReg, ATReg, IDLoc, STI); TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI); return false; } return true; } // If the macro expansion of (d)div(u) or (d)rem(u) would always trap or // break, insert the trap/break and exit. This gives a different result to // GAS. GAS has an inconsistency/missed optimization in that not all cases // are handled equivalently. As the observed behaviour is the same, we're ok. if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) { if (UseTraps) { TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI); return false; } TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI); return false; } // (d)rem(u) $0, $X, $Y is a special case. Like div $zero, $X, $Y, it does // not expand to macro sequence. if (isRem && (RdReg == Mips::ZERO || RdReg == Mips::ZERO_64)) { TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI); return false; } // Temporary label for first branch traget MCContext &Context = TOut.getStreamer().getContext(); MCSymbol *BrTarget; MCOperand LabelOp; if (UseTraps) { TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI); } else { // Branch to the li instruction. BrTarget = Context.createTempSymbol(); LabelOp = MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context)); TOut.emitRRX(Mips::BNE, RtReg, ZeroReg, LabelOp, IDLoc, STI); } TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI); if (!UseTraps) TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI); if (!Signed) { if (!UseTraps) TOut.getStreamer().EmitLabel(BrTarget); TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI); return false; } unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; if (!UseTraps) TOut.getStreamer().EmitLabel(BrTarget); TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI); // Temporary label for the second branch target. MCSymbol *BrTargetEnd = Context.createTempSymbol(); MCOperand LabelOpEnd = MCOperand::createExpr(MCSymbolRefExpr::create(BrTargetEnd, Context)); // Branch to the mflo instruction. TOut.emitRRX(Mips::BNE, RtReg, ATReg, LabelOpEnd, IDLoc, STI); if (IsMips64) { TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI); TOut.emitDSLL(ATReg, ATReg, 63, IDLoc, STI); } else { TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI); } if (UseTraps) TOut.emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, STI); else { // Branch to the mflo instruction. TOut.emitRRX(Mips::BNE, RsReg, ATReg, LabelOpEnd, IDLoc, STI); TOut.emitNop(IDLoc, STI); TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI); } TOut.getStreamer().EmitLabel(BrTargetEnd); TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI); return false; } bool MipsAsmParser::expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 3 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isReg() && "Invalid instruction operand."); unsigned FirstReg = Inst.getOperand(0).getReg(); unsigned SecondReg = Inst.getOperand(1).getReg(); unsigned ThirdReg = Inst.getOperand(2).getReg(); if (hasMips1() && !hasMips2()) { unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI); TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI); TOut.emitNop(IDLoc, STI); TOut.emitRRI(Mips::ORi, ATReg, ThirdReg, 0x3, IDLoc, STI); TOut.emitRRI(Mips::XORi, ATReg, ATReg, 0x2, IDLoc, STI); TOut.emitRR(Mips::CTC1, Mips::RA, ATReg, IDLoc, STI); TOut.emitNop(IDLoc, STI); TOut.emitRR(IsDouble ? (Is64FPU ? Mips::CVT_W_D64 : Mips::CVT_W_D32) : Mips::CVT_W_S, FirstReg, SecondReg, IDLoc, STI); TOut.emitRR(Mips::CTC1, Mips::RA, ThirdReg, IDLoc, STI); TOut.emitNop(IDLoc, STI); return false; } TOut.emitRR(IsDouble ? (Is64FPU ? Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32) : Mips::TRUNC_W_S, FirstReg, SecondReg, IDLoc, STI); return false; } bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { if (hasMips32r6() || hasMips64r6()) { return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); } const MCOperand &DstRegOp = Inst.getOperand(0); assert(DstRegOp.isReg() && "expected register operand kind"); const MCOperand &SrcRegOp = Inst.getOperand(1); assert(SrcRegOp.isReg() && "expected register operand kind"); const MCOperand &OffsetImmOp = Inst.getOperand(2); assert(OffsetImmOp.isImm() && "expected immediate operand kind"); MipsTargetStreamer &TOut = getTargetStreamer(); unsigned DstReg = DstRegOp.getReg(); unsigned SrcReg = SrcRegOp.getReg(); int64_t OffsetValue = OffsetImmOp.getImm(); // NOTE: We always need AT for ULHU, as it is always used as the source // register for one of the LBu's. warnIfNoMacro(IDLoc); unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; bool IsLargeOffset = !(isInt<16>(OffsetValue + 1) && isInt<16>(OffsetValue)); if (IsLargeOffset) { if (loadImmediate(OffsetValue, ATReg, SrcReg, !ABI.ArePtrs64bit(), true, IDLoc, Out, STI)) return true; } int64_t FirstOffset = IsLargeOffset ? 0 : OffsetValue; int64_t SecondOffset = IsLargeOffset ? 1 : (OffsetValue + 1); if (isLittle()) std::swap(FirstOffset, SecondOffset); unsigned FirstLbuDstReg = IsLargeOffset ? DstReg : ATReg; unsigned SecondLbuDstReg = IsLargeOffset ? ATReg : DstReg; unsigned LbuSrcReg = IsLargeOffset ? ATReg : SrcReg; unsigned SllReg = IsLargeOffset ? DstReg : ATReg; TOut.emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg, FirstOffset, IDLoc, STI); TOut.emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondOffset, IDLoc, STI); TOut.emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, STI); TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI); return false; } bool MipsAsmParser::expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { if (hasMips32r6() || hasMips64r6()) { return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); } const MCOperand &DstRegOp = Inst.getOperand(0); assert(DstRegOp.isReg() && "expected register operand kind"); const MCOperand &SrcRegOp = Inst.getOperand(1); assert(SrcRegOp.isReg() && "expected register operand kind"); const MCOperand &OffsetImmOp = Inst.getOperand(2); assert(OffsetImmOp.isImm() && "expected immediate operand kind"); MipsTargetStreamer &TOut = getTargetStreamer(); unsigned DstReg = DstRegOp.getReg(); unsigned SrcReg = SrcRegOp.getReg(); int64_t OffsetValue = OffsetImmOp.getImm(); warnIfNoMacro(IDLoc); unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; bool IsLargeOffset = !(isInt<16>(OffsetValue + 1) && isInt<16>(OffsetValue)); if (IsLargeOffset) { if (loadImmediate(OffsetValue, ATReg, SrcReg, !ABI.ArePtrs64bit(), true, IDLoc, Out, STI)) return true; } int64_t FirstOffset = IsLargeOffset ? 1 : (OffsetValue + 1); int64_t SecondOffset = IsLargeOffset ? 0 : OffsetValue; if (isLittle()) std::swap(FirstOffset, SecondOffset); if (IsLargeOffset) { TOut.emitRRI(Mips::SB, DstReg, ATReg, FirstOffset, IDLoc, STI); TOut.emitRRI(Mips::SRL, DstReg, DstReg, 8, IDLoc, STI); TOut.emitRRI(Mips::SB, DstReg, ATReg, SecondOffset, IDLoc, STI); TOut.emitRRI(Mips::LBu, ATReg, ATReg, 0, IDLoc, STI); TOut.emitRRI(Mips::SLL, DstReg, DstReg, 8, IDLoc, STI); TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI); } else { TOut.emitRRI(Mips::SB, DstReg, SrcReg, FirstOffset, IDLoc, STI); TOut.emitRRI(Mips::SRL, ATReg, DstReg, 8, IDLoc, STI); TOut.emitRRI(Mips::SB, ATReg, SrcReg, SecondOffset, IDLoc, STI); } return false; } bool MipsAsmParser::expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { if (hasMips32r6() || hasMips64r6()) { return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); } const MCOperand &DstRegOp = Inst.getOperand(0); assert(DstRegOp.isReg() && "expected register operand kind"); const MCOperand &SrcRegOp = Inst.getOperand(1); assert(SrcRegOp.isReg() && "expected register operand kind"); const MCOperand &OffsetImmOp = Inst.getOperand(2); assert(OffsetImmOp.isImm() && "expected immediate operand kind"); MipsTargetStreamer &TOut = getTargetStreamer(); unsigned DstReg = DstRegOp.getReg(); unsigned SrcReg = SrcRegOp.getReg(); int64_t OffsetValue = OffsetImmOp.getImm(); // Compute left/right load/store offsets. bool IsLargeOffset = !(isInt<16>(OffsetValue + 3) && isInt<16>(OffsetValue)); int64_t LxlOffset = IsLargeOffset ? 0 : OffsetValue; int64_t LxrOffset = IsLargeOffset ? 3 : (OffsetValue + 3); if (isLittle()) std::swap(LxlOffset, LxrOffset); bool IsLoadInst = (Inst.getOpcode() == Mips::Ulw); bool DoMove = IsLoadInst && (SrcReg == DstReg) && !IsLargeOffset; unsigned TmpReg = SrcReg; if (IsLargeOffset || DoMove) { warnIfNoMacro(IDLoc); TmpReg = getATReg(IDLoc); if (!TmpReg) return true; } if (IsLargeOffset) { if (loadImmediate(OffsetValue, TmpReg, SrcReg, !ABI.ArePtrs64bit(), true, IDLoc, Out, STI)) return true; } if (DoMove) std::swap(DstReg, TmpReg); unsigned XWL = IsLoadInst ? Mips::LWL : Mips::SWL; unsigned XWR = IsLoadInst ? Mips::LWR : Mips::SWR; TOut.emitRRI(XWL, DstReg, TmpReg, LxlOffset, IDLoc, STI); TOut.emitRRI(XWR, DstReg, TmpReg, LxrOffset, IDLoc, STI); if (DoMove) TOut.emitRRR(Mips::OR, TmpReg, DstReg, Mips::ZERO, IDLoc, STI); return false; } bool MipsAsmParser::expandSge(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 3 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isReg() && "Invalid instruction operand."); unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); unsigned OpReg = Inst.getOperand(2).getReg(); unsigned OpCode; warnIfNoMacro(IDLoc); switch (Inst.getOpcode()) { case Mips::SGE: OpCode = Mips::SLT; break; case Mips::SGEU: OpCode = Mips::SLTu; break; default: llvm_unreachable("unexpected 'sge' opcode"); } // $SrcReg >= $OpReg is equal to (not ($SrcReg < $OpReg)) TOut.emitRRR(OpCode, DstReg, SrcReg, OpReg, IDLoc, STI); TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI); return false; } bool MipsAsmParser::expandSgeImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 3 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm() && "Invalid instruction operand."); unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); int64_t ImmValue = Inst.getOperand(2).getImm(); unsigned OpRegCode, OpImmCode; warnIfNoMacro(IDLoc); switch (Inst.getOpcode()) { case Mips::SGEImm: case Mips::SGEImm64: OpRegCode = Mips::SLT; OpImmCode = Mips::SLTi; break; case Mips::SGEUImm: case Mips::SGEUImm64: OpRegCode = Mips::SLTu; OpImmCode = Mips::SLTiu; break; default: llvm_unreachable("unexpected 'sge' opcode with immediate"); } // $SrcReg >= Imm is equal to (not ($SrcReg < Imm)) if (isInt<16>(ImmValue)) { // Use immediate version of STL. TOut.emitRRI(OpImmCode, DstReg, SrcReg, ImmValue, IDLoc, STI); TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI); } else { unsigned ImmReg = DstReg; if (DstReg == SrcReg) { unsigned ATReg = getATReg(Inst.getLoc()); if (!ATReg) return true; ImmReg = ATReg; } if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue), false, IDLoc, Out, STI)) return true; TOut.emitRRR(OpRegCode, DstReg, SrcReg, ImmReg, IDLoc, STI); TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI); } return false; } bool MipsAsmParser::expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 3 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm() && "Invalid instruction operand."); unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); unsigned ImmReg = DstReg; int64_t ImmValue = Inst.getOperand(2).getImm(); unsigned OpCode; warnIfNoMacro(IDLoc); switch (Inst.getOpcode()) { case Mips::SGTImm: case Mips::SGTImm64: OpCode = Mips::SLT; break; case Mips::SGTUImm: case Mips::SGTUImm64: OpCode = Mips::SLTu; break; default: llvm_unreachable("unexpected 'sgt' opcode with immediate"); } if (DstReg == SrcReg) { unsigned ATReg = getATReg(Inst.getLoc()); if (!ATReg) return true; ImmReg = ATReg; } if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue), false, IDLoc, Out, STI)) return true; // $SrcReg > $ImmReg is equal to $ImmReg < $SrcReg TOut.emitRRR(OpCode, DstReg, ImmReg, SrcReg, IDLoc, STI); return false; } bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 3 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm() && "Invalid instruction operand."); unsigned ATReg = Mips::NoRegister; unsigned FinalDstReg = Mips::NoRegister; unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); int64_t ImmValue = Inst.getOperand(2).getImm(); bool Is32Bit = isInt<32>(ImmValue) || (!isGP64bit() && isUInt<32>(ImmValue)); unsigned FinalOpcode = Inst.getOpcode(); if (DstReg == SrcReg) { ATReg = getATReg(Inst.getLoc()); if (!ATReg) return true; FinalDstReg = DstReg; DstReg = ATReg; } if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Out, STI)) { switch (FinalOpcode) { default: llvm_unreachable("unimplemented expansion"); case Mips::ADDi: FinalOpcode = Mips::ADD; break; case Mips::ADDiu: FinalOpcode = Mips::ADDu; break; case Mips::ANDi: FinalOpcode = Mips::AND; break; case Mips::NORImm: FinalOpcode = Mips::NOR; break; case Mips::ORi: FinalOpcode = Mips::OR; break; case Mips::SLTi: FinalOpcode = Mips::SLT; break; case Mips::SLTiu: FinalOpcode = Mips::SLTu; break; case Mips::XORi: FinalOpcode = Mips::XOR; break; case Mips::ADDi_MM: FinalOpcode = Mips::ADD_MM; break; case Mips::ADDiu_MM: FinalOpcode = Mips::ADDu_MM; break; case Mips::ANDi_MM: FinalOpcode = Mips::AND_MM; break; case Mips::ORi_MM: FinalOpcode = Mips::OR_MM; break; case Mips::SLTi_MM: FinalOpcode = Mips::SLT_MM; break; case Mips::SLTiu_MM: FinalOpcode = Mips::SLTu_MM; break; case Mips::XORi_MM: FinalOpcode = Mips::XOR_MM; break; case Mips::ANDi64: FinalOpcode = Mips::AND64; break; case Mips::NORImm64: FinalOpcode = Mips::NOR64; break; case Mips::ORi64: FinalOpcode = Mips::OR64; break; case Mips::SLTImm64: FinalOpcode = Mips::SLT64; break; case Mips::SLTUImm64: FinalOpcode = Mips::SLTu64; break; case Mips::XORi64: FinalOpcode = Mips::XOR64; break; } if (FinalDstReg == Mips::NoRegister) TOut.emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, STI); else TOut.emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc, STI); return false; } return true; } bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned ATReg = Mips::NoRegister; unsigned DReg = Inst.getOperand(0).getReg(); unsigned SReg = Inst.getOperand(1).getReg(); unsigned TReg = Inst.getOperand(2).getReg(); unsigned TmpReg = DReg; unsigned FirstShift = Mips::NOP; unsigned SecondShift = Mips::NOP; if (hasMips32r2()) { if (DReg == SReg) { TmpReg = getATReg(Inst.getLoc()); if (!TmpReg) return true; } if (Inst.getOpcode() == Mips::ROL) { TOut.emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI); TOut.emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI); return false; } if (Inst.getOpcode() == Mips::ROR) { TOut.emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), STI); return false; } return true; } if (hasMips32()) { switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected instruction opcode"); case Mips::ROL: FirstShift = Mips::SRLV; SecondShift = Mips::SLLV; break; case Mips::ROR: FirstShift = Mips::SLLV; SecondShift = Mips::SRLV; break; } ATReg = getATReg(Inst.getLoc()); if (!ATReg) return true; TOut.emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI); TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI); TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI); TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI); return false; } return true; } bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned ATReg = Mips::NoRegister; unsigned DReg = Inst.getOperand(0).getReg(); unsigned SReg = Inst.getOperand(1).getReg(); int64_t ImmValue = Inst.getOperand(2).getImm(); unsigned FirstShift = Mips::NOP; unsigned SecondShift = Mips::NOP; if (hasMips32r2()) { if (Inst.getOpcode() == Mips::ROLImm) { uint64_t MaxShift = 32; uint64_t ShiftValue = ImmValue; if (ImmValue != 0) ShiftValue = MaxShift - ImmValue; TOut.emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), STI); return false; } if (Inst.getOpcode() == Mips::RORImm) { TOut.emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), STI); return false; } return true; } if (hasMips32()) { if (ImmValue == 0) { TOut.emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), STI); return false; } switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected instruction opcode"); case Mips::ROLImm: FirstShift = Mips::SLL; SecondShift = Mips::SRL; break; case Mips::RORImm: FirstShift = Mips::SRL; SecondShift = Mips::SLL; break; } ATReg = getATReg(Inst.getLoc()); if (!ATReg) return true; TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), STI); TOut.emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), STI); TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI); return false; } return true; } bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned ATReg = Mips::NoRegister; unsigned DReg = Inst.getOperand(0).getReg(); unsigned SReg = Inst.getOperand(1).getReg(); unsigned TReg = Inst.getOperand(2).getReg(); unsigned TmpReg = DReg; unsigned FirstShift = Mips::NOP; unsigned SecondShift = Mips::NOP; if (hasMips64r2()) { if (TmpReg == SReg) { TmpReg = getATReg(Inst.getLoc()); if (!TmpReg) return true; } if (Inst.getOpcode() == Mips::DROL) { TOut.emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI); TOut.emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI); return false; } if (Inst.getOpcode() == Mips::DROR) { TOut.emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), STI); return false; } return true; } if (hasMips64()) { switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected instruction opcode"); case Mips::DROL: FirstShift = Mips::DSRLV; SecondShift = Mips::DSLLV; break; case Mips::DROR: FirstShift = Mips::DSLLV; SecondShift = Mips::DSRLV; break; } ATReg = getATReg(Inst.getLoc()); if (!ATReg) return true; TOut.emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI); TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI); TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI); TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI); return false; } return true; } bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned ATReg = Mips::NoRegister; unsigned DReg = Inst.getOperand(0).getReg(); unsigned SReg = Inst.getOperand(1).getReg(); int64_t ImmValue = Inst.getOperand(2).getImm() % 64; unsigned FirstShift = Mips::NOP; unsigned SecondShift = Mips::NOP; MCInst TmpInst; if (hasMips64r2()) { unsigned FinalOpcode = Mips::NOP; if (ImmValue == 0) FinalOpcode = Mips::DROTR; else if (ImmValue % 32 == 0) FinalOpcode = Mips::DROTR32; else if ((ImmValue >= 1) && (ImmValue <= 32)) { if (Inst.getOpcode() == Mips::DROLImm) FinalOpcode = Mips::DROTR32; else FinalOpcode = Mips::DROTR; } else if (ImmValue >= 33) { if (Inst.getOpcode() == Mips::DROLImm) FinalOpcode = Mips::DROTR; else FinalOpcode = Mips::DROTR32; } uint64_t ShiftValue = ImmValue % 32; if (Inst.getOpcode() == Mips::DROLImm) ShiftValue = (32 - ImmValue % 32) % 32; TOut.emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), STI); return false; } if (hasMips64()) { if (ImmValue == 0) { TOut.emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), STI); return false; } switch (Inst.getOpcode()) { default: llvm_unreachable("unexpected instruction opcode"); case Mips::DROLImm: if ((ImmValue >= 1) && (ImmValue <= 31)) { FirstShift = Mips::DSLL; SecondShift = Mips::DSRL32; } if (ImmValue == 32) { FirstShift = Mips::DSLL32; SecondShift = Mips::DSRL32; } if ((ImmValue >= 33) && (ImmValue <= 63)) { FirstShift = Mips::DSLL32; SecondShift = Mips::DSRL; } break; case Mips::DRORImm: if ((ImmValue >= 1) && (ImmValue <= 31)) { FirstShift = Mips::DSRL; SecondShift = Mips::DSLL32; } if (ImmValue == 32) { FirstShift = Mips::DSRL32; SecondShift = Mips::DSLL32; } if ((ImmValue >= 33) && (ImmValue <= 63)) { FirstShift = Mips::DSRL32; SecondShift = Mips::DSLL; } break; } ATReg = getATReg(Inst.getLoc()); if (!ATReg) return true; TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), STI); TOut.emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32, Inst.getLoc(), STI); TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI); return false; } return true; } bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned FirstRegOp = Inst.getOperand(0).getReg(); unsigned SecondRegOp = Inst.getOperand(1).getReg(); TOut.emitRI(Mips::BGEZ, SecondRegOp, 8, IDLoc, STI); if (FirstRegOp != SecondRegOp) TOut.emitRRR(Mips::ADDu, FirstRegOp, SecondRegOp, Mips::ZERO, IDLoc, STI); else TOut.emitEmptyDelaySlot(false, IDLoc, STI); TOut.emitRRR(Mips::SUB, FirstRegOp, Mips::ZERO, SecondRegOp, IDLoc, STI); return false; } bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned ATReg = Mips::NoRegister; unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); int32_t ImmValue = Inst.getOperand(2).getImm(); ATReg = getATReg(IDLoc); if (!ATReg) return true; loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out, STI); TOut.emitRR(Inst.getOpcode() == Mips::MULImmMacro ? Mips::MULT : Mips::DMULT, SrcReg, ATReg, IDLoc, STI); TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI); return false; } bool MipsAsmParser::expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned ATReg = Mips::NoRegister; unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); unsigned TmpReg = Inst.getOperand(2).getReg(); ATReg = getATReg(Inst.getLoc()); if (!ATReg) return true; TOut.emitRR(Inst.getOpcode() == Mips::MULOMacro ? Mips::MULT : Mips::DMULT, SrcReg, TmpReg, IDLoc, STI); TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI); TOut.emitRRI(Inst.getOpcode() == Mips::MULOMacro ? Mips::SRA : Mips::DSRA32, DstReg, DstReg, 0x1F, IDLoc, STI); TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI); if (useTraps()) { TOut.emitRRI(Mips::TNE, DstReg, ATReg, 6, IDLoc, STI); } else { MCContext & Context = TOut.getStreamer().getContext(); MCSymbol * BrTarget = Context.createTempSymbol(); MCOperand LabelOp = MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context)); TOut.emitRRX(Mips::BEQ, DstReg, ATReg, LabelOp, IDLoc, STI); if (AssemblerOptions.back()->isReorder()) TOut.emitNop(IDLoc, STI); TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI); TOut.getStreamer().EmitLabel(BrTarget); } TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI); return false; } bool MipsAsmParser::expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned ATReg = Mips::NoRegister; unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); unsigned TmpReg = Inst.getOperand(2).getReg(); ATReg = getATReg(IDLoc); if (!ATReg) return true; TOut.emitRR(Inst.getOpcode() == Mips::MULOUMacro ? Mips::MULTu : Mips::DMULTu, SrcReg, TmpReg, IDLoc, STI); TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI); TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI); if (useTraps()) { TOut.emitRRI(Mips::TNE, ATReg, Mips::ZERO, 6, IDLoc, STI); } else { MCContext & Context = TOut.getStreamer().getContext(); MCSymbol * BrTarget = Context.createTempSymbol(); MCOperand LabelOp = MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context)); TOut.emitRRX(Mips::BEQ, ATReg, Mips::ZERO, LabelOp, IDLoc, STI); if (AssemblerOptions.back()->isReorder()) TOut.emitNop(IDLoc, STI); TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI); TOut.getStreamer().EmitLabel(BrTarget); } return false; } bool MipsAsmParser::expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); unsigned TmpReg = Inst.getOperand(2).getReg(); TOut.emitRR(Mips::DMULTu, SrcReg, TmpReg, IDLoc, STI); TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI); return false; } // Expand 'ld $ offset($reg2)' to 'lw $, offset($reg2); // lw $>, offset+4($reg2)' // or expand 'sd $ offset($reg2)' to 'sw $, offset($reg2); // sw $>, offset+4($reg2)' // for O32. bool MipsAsmParser::expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI, bool IsLoad) { if (!isABI_O32()) return true; warnIfNoMacro(IDLoc); MipsTargetStreamer &TOut = getTargetStreamer(); unsigned Opcode = IsLoad ? Mips::LW : Mips::SW; unsigned FirstReg = Inst.getOperand(0).getReg(); unsigned SecondReg = nextReg(FirstReg); unsigned BaseReg = Inst.getOperand(1).getReg(); if (!SecondReg) return true; warnIfRegIndexIsAT(FirstReg, IDLoc); assert(Inst.getOperand(2).isImm() && "Offset for load macro is not immediate!"); MCOperand &FirstOffset = Inst.getOperand(2); signed NextOffset = FirstOffset.getImm() + 4; MCOperand SecondOffset = MCOperand::createImm(NextOffset); if (!isInt<16>(FirstOffset.getImm()) || !isInt<16>(NextOffset)) return true; // For loads, clobber the base register with the second load instead of the // first if the BaseReg == FirstReg. if (FirstReg != BaseReg || !IsLoad) { TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI); TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI); } else { TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI); TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI); } return false; } // Expand 's.d $ offset($reg2)' to 'swc1 $, offset($reg2); // swc1 $, offset+4($reg2)' // or if little endian to 'swc1 $, offset($reg2); // swc1 $, offset+4($reg2)' // for Mips1. bool MipsAsmParser::expandStoreDM1Macro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { if (!isABI_O32()) return true; warnIfNoMacro(IDLoc); MipsTargetStreamer &TOut = getTargetStreamer(); unsigned Opcode = Mips::SWC1; unsigned FirstReg = Inst.getOperand(0).getReg(); unsigned SecondReg = nextReg(FirstReg); unsigned BaseReg = Inst.getOperand(1).getReg(); if (!SecondReg) return true; warnIfRegIndexIsAT(FirstReg, IDLoc); assert(Inst.getOperand(2).isImm() && "Offset for macro is not immediate!"); MCOperand &FirstOffset = Inst.getOperand(2); signed NextOffset = FirstOffset.getImm() + 4; MCOperand SecondOffset = MCOperand::createImm(NextOffset); if (!isInt<16>(FirstOffset.getImm()) || !isInt<16>(NextOffset)) return true; if (!IsLittleEndian) std::swap(FirstReg, SecondReg); TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI); TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI); return false; } bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 3 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isReg() && "Invalid instruction operand."); unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); unsigned OpReg = Inst.getOperand(2).getReg(); warnIfNoMacro(IDLoc); if (SrcReg != Mips::ZERO && OpReg != Mips::ZERO) { TOut.emitRRR(Mips::XOR, DstReg, SrcReg, OpReg, IDLoc, STI); TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI); return false; } unsigned Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg; TOut.emitRRI(Mips::SLTiu, DstReg, Reg, 1, IDLoc, STI); return false; } bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); assert(Inst.getNumOperands() == 3 && "Invalid operand count"); assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm() && "Invalid instruction operand."); unsigned DstReg = Inst.getOperand(0).getReg(); unsigned SrcReg = Inst.getOperand(1).getReg(); int64_t Imm = Inst.getOperand(2).getImm(); warnIfNoMacro(IDLoc); if (Imm == 0) { TOut.emitRRI(Mips::SLTiu, DstReg, SrcReg, 1, IDLoc, STI); return false; } if (SrcReg == Mips::ZERO) { Warning(IDLoc, "comparison is always false"); TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, DstReg, SrcReg, SrcReg, IDLoc, STI); return false; } unsigned Opc; if (Imm > -0x8000 && Imm < 0) { Imm = -Imm; Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu; } else { Opc = Mips::XORi; } if (!isUInt<16>(Imm)) { unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; if (loadImmediate(Imm, ATReg, Mips::NoRegister, true, isGP64bit(), IDLoc, Out, STI)) return true; TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI); TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI); return false; } TOut.emitRRI(Opc, DstReg, SrcReg, Imm, IDLoc, STI); TOut.emitRRI(Mips::SLTiu, DstReg, DstReg, 1, IDLoc, STI); return false; } // Map the DSP accumulator and control register to the corresponding gpr // operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions // do not map the DSP registers contigously to gpr registers. static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) { switch (Inst.getOpcode()) { case Mips::MFTLO: case Mips::MTTLO: switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { case Mips::AC0: return Mips::ZERO; case Mips::AC1: return Mips::A0; case Mips::AC2: return Mips::T0; case Mips::AC3: return Mips::T4; default: llvm_unreachable("Unknown register for 'mttr' alias!"); } case Mips::MFTHI: case Mips::MTTHI: switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { case Mips::AC0: return Mips::AT; case Mips::AC1: return Mips::A1; case Mips::AC2: return Mips::T1; case Mips::AC3: return Mips::T5; default: llvm_unreachable("Unknown register for 'mttr' alias!"); } case Mips::MFTACX: case Mips::MTTACX: switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { case Mips::AC0: return Mips::V0; case Mips::AC1: return Mips::A2; case Mips::AC2: return Mips::T2; case Mips::AC3: return Mips::T6; default: llvm_unreachable("Unknown register for 'mttr' alias!"); } case Mips::MFTDSP: case Mips::MTTDSP: return Mips::S0; default: llvm_unreachable("Unknown instruction for 'mttr' dsp alias!"); } } // Map the floating point register operand to the corresponding register // operand. static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) { switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) { case Mips::F0: return Mips::ZERO; case Mips::F1: return Mips::AT; case Mips::F2: return Mips::V0; case Mips::F3: return Mips::V1; case Mips::F4: return Mips::A0; case Mips::F5: return Mips::A1; case Mips::F6: return Mips::A2; case Mips::F7: return Mips::A3; case Mips::F8: return Mips::T0; case Mips::F9: return Mips::T1; case Mips::F10: return Mips::T2; case Mips::F11: return Mips::T3; case Mips::F12: return Mips::T4; case Mips::F13: return Mips::T5; case Mips::F14: return Mips::T6; case Mips::F15: return Mips::T7; case Mips::F16: return Mips::S0; case Mips::F17: return Mips::S1; case Mips::F18: return Mips::S2; case Mips::F19: return Mips::S3; case Mips::F20: return Mips::S4; case Mips::F21: return Mips::S5; case Mips::F22: return Mips::S6; case Mips::F23: return Mips::S7; case Mips::F24: return Mips::T8; case Mips::F25: return Mips::T9; case Mips::F26: return Mips::K0; case Mips::F27: return Mips::K1; case Mips::F28: return Mips::GP; case Mips::F29: return Mips::SP; case Mips::F30: return Mips::FP; case Mips::F31: return Mips::RA; default: llvm_unreachable("Unknown register for mttc1 alias!"); } } // Map the coprocessor operand the corresponding gpr register operand. static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) { switch (Inst.getOperand(IsMFTC0 ? 1 : 0).getReg()) { case Mips::COP00: return Mips::ZERO; case Mips::COP01: return Mips::AT; case Mips::COP02: return Mips::V0; case Mips::COP03: return Mips::V1; case Mips::COP04: return Mips::A0; case Mips::COP05: return Mips::A1; case Mips::COP06: return Mips::A2; case Mips::COP07: return Mips::A3; case Mips::COP08: return Mips::T0; case Mips::COP09: return Mips::T1; case Mips::COP010: return Mips::T2; case Mips::COP011: return Mips::T3; case Mips::COP012: return Mips::T4; case Mips::COP013: return Mips::T5; case Mips::COP014: return Mips::T6; case Mips::COP015: return Mips::T7; case Mips::COP016: return Mips::S0; case Mips::COP017: return Mips::S1; case Mips::COP018: return Mips::S2; case Mips::COP019: return Mips::S3; case Mips::COP020: return Mips::S4; case Mips::COP021: return Mips::S5; case Mips::COP022: return Mips::S6; case Mips::COP023: return Mips::S7; case Mips::COP024: return Mips::T8; case Mips::COP025: return Mips::T9; case Mips::COP026: return Mips::K0; case Mips::COP027: return Mips::K1; case Mips::COP028: return Mips::GP; case Mips::COP029: return Mips::SP; case Mips::COP030: return Mips::FP; case Mips::COP031: return Mips::RA; default: llvm_unreachable("Unknown register for mttc0 alias!"); } } /// Expand an alias of 'mftr' or 'mttr' into the full instruction, by producing /// an mftr or mttr with the correctly mapped gpr register, u, sel and h bits. bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { MipsTargetStreamer &TOut = getTargetStreamer(); unsigned rd = 0; unsigned u = 1; unsigned sel = 0; unsigned h = 0; bool IsMFTR = false; switch (Inst.getOpcode()) { case Mips::MFTC0: IsMFTR = true; LLVM_FALLTHROUGH; case Mips::MTTC0: u = 0; rd = getRegisterForMxtrC0(Inst, IsMFTR); sel = Inst.getOperand(2).getImm(); break; case Mips::MFTGPR: IsMFTR = true; LLVM_FALLTHROUGH; case Mips::MTTGPR: rd = Inst.getOperand(IsMFTR ? 1 : 0).getReg(); break; case Mips::MFTLO: case Mips::MFTHI: case Mips::MFTACX: case Mips::MFTDSP: IsMFTR = true; LLVM_FALLTHROUGH; case Mips::MTTLO: case Mips::MTTHI: case Mips::MTTACX: case Mips::MTTDSP: rd = getRegisterForMxtrDSP(Inst, IsMFTR); sel = 1; break; case Mips::MFTHC1: h = 1; LLVM_FALLTHROUGH; case Mips::MFTC1: IsMFTR = true; rd = getRegisterForMxtrFP(Inst, IsMFTR); sel = 2; break; case Mips::MTTHC1: h = 1; LLVM_FALLTHROUGH; case Mips::MTTC1: rd = getRegisterForMxtrFP(Inst, IsMFTR); sel = 2; break; case Mips::CFTC1: IsMFTR = true; LLVM_FALLTHROUGH; case Mips::CTTC1: rd = getRegisterForMxtrFP(Inst, IsMFTR); sel = 3; break; } unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd; unsigned Op1 = IsMFTR ? rd : (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg() : Inst.getOperand(0).getReg()); TOut.emitRRIII(IsMFTR ? Mips::MFTR : Mips::MTTR, Op0, Op1, u, sel, h, IDLoc, STI); return false; } unsigned MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) { switch (Inst.getOpcode()) { default: return Match_Success; case Mips::DATI: case Mips::DAHI: if (static_cast(*Operands[1]) .isValidForTie(static_cast(*Operands[2]))) return Match_Success; return Match_RequiresSameSrcAndDst; } } unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { switch (Inst.getOpcode()) { // As described by the MIPSR6 spec, daui must not use the zero operand for // its source operand. case Mips::DAUI: if (Inst.getOperand(1).getReg() == Mips::ZERO || Inst.getOperand(1).getReg() == Mips::ZERO_64) return Match_RequiresNoZeroRegister; return Match_Success; // As described by the Mips32r2 spec, the registers Rd and Rs for // jalr.hb must be different. // It also applies for registers Rt and Rs of microMIPSr6 jalrc.hb instruction // and registers Rd and Base for microMIPS lwp instruction case Mips::JALR_HB: case Mips::JALR_HB64: case Mips::JALRC_HB_MMR6: case Mips::JALRC_MMR6: if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) return Match_RequiresDifferentSrcAndDst; return Match_Success; case Mips::LWP_MM: if (Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) return Match_RequiresDifferentSrcAndDst; return Match_Success; case Mips::SYNC: if (Inst.getOperand(0).getImm() != 0 && !hasMips32()) return Match_NonZeroOperandForSync; return Match_Success; case Mips::MFC0: case Mips::MTC0: case Mips::MTC2: case Mips::MFC2: if (Inst.getOperand(2).getImm() != 0 && !hasMips32()) return Match_NonZeroOperandForMTCX; return Match_Success; // As described the MIPSR6 spec, the compact branches that compare registers // must: // a) Not use the zero register. // b) Not use the same register twice. // c) rs < rt for bnec, beqc. // NB: For this case, the encoding will swap the operands as their // ordering doesn't matter. GAS performs this transformation too. // Hence, that constraint does not have to be enforced. // // The compact branches that branch iff the signed addition of two registers // would overflow must have rs >= rt. That can be handled like beqc/bnec with // operand swapping. They do not have restriction of using the zero register. case Mips::BLEZC: case Mips::BLEZC_MMR6: case Mips::BGEZC: case Mips::BGEZC_MMR6: case Mips::BGTZC: case Mips::BGTZC_MMR6: case Mips::BLTZC: case Mips::BLTZC_MMR6: case Mips::BEQZC: case Mips::BEQZC_MMR6: case Mips::BNEZC: case Mips::BNEZC_MMR6: case Mips::BLEZC64: case Mips::BGEZC64: case Mips::BGTZC64: case Mips::BLTZC64: case Mips::BEQZC64: case Mips::BNEZC64: if (Inst.getOperand(0).getReg() == Mips::ZERO || Inst.getOperand(0).getReg() == Mips::ZERO_64) return Match_RequiresNoZeroRegister; return Match_Success; case Mips::BGEC: case Mips::BGEC_MMR6: case Mips::BLTC: case Mips::BLTC_MMR6: case Mips::BGEUC: case Mips::BGEUC_MMR6: case Mips::BLTUC: case Mips::BLTUC_MMR6: case Mips::BEQC: case Mips::BEQC_MMR6: case Mips::BNEC: case Mips::BNEC_MMR6: case Mips::BGEC64: case Mips::BLTC64: case Mips::BGEUC64: case Mips::BLTUC64: case Mips::BEQC64: case Mips::BNEC64: if (Inst.getOperand(0).getReg() == Mips::ZERO || Inst.getOperand(0).getReg() == Mips::ZERO_64) return Match_RequiresNoZeroRegister; if (Inst.getOperand(1).getReg() == Mips::ZERO || Inst.getOperand(1).getReg() == Mips::ZERO_64) return Match_RequiresNoZeroRegister; if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) return Match_RequiresDifferentOperands; return Match_Success; case Mips::DINS: { assert(Inst.getOperand(2).isImm() && Inst.getOperand(3).isImm() && "Operands must be immediates for dins!"); const signed Pos = Inst.getOperand(2).getImm(); const signed Size = Inst.getOperand(3).getImm(); if ((0 > (Pos + Size)) || ((Pos + Size) > 32)) return Match_RequiresPosSizeRange0_32; return Match_Success; } case Mips::DINSM: case Mips::DINSU: { assert(Inst.getOperand(2).isImm() && Inst.getOperand(3).isImm() && "Operands must be immediates for dinsm/dinsu!"); const signed Pos = Inst.getOperand(2).getImm(); const signed Size = Inst.getOperand(3).getImm(); if ((32 >= (Pos + Size)) || ((Pos + Size) > 64)) return Match_RequiresPosSizeRange33_64; return Match_Success; } case Mips::DEXT: { assert(Inst.getOperand(2).isImm() && Inst.getOperand(3).isImm() && "Operands must be immediates for DEXTM!"); const signed Pos = Inst.getOperand(2).getImm(); const signed Size = Inst.getOperand(3).getImm(); if ((1 > (Pos + Size)) || ((Pos + Size) > 63)) return Match_RequiresPosSizeUImm6; return Match_Success; } case Mips::DEXTM: case Mips::DEXTU: { assert(Inst.getOperand(2).isImm() && Inst.getOperand(3).isImm() && "Operands must be immediates for dextm/dextu!"); const signed Pos = Inst.getOperand(2).getImm(); const signed Size = Inst.getOperand(3).getImm(); if ((32 > (Pos + Size)) || ((Pos + Size) > 64)) return Match_RequiresPosSizeRange33_64; return Match_Success; } case Mips::CRC32B: case Mips::CRC32CB: case Mips::CRC32H: case Mips::CRC32CH: case Mips::CRC32W: case Mips::CRC32CW: case Mips::CRC32D: case Mips::CRC32CD: if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) return Match_RequiresSameSrcAndDst; return Match_Success; } uint64_t TSFlags = getInstDesc(Inst.getOpcode()).TSFlags; if ((TSFlags & MipsII::HasFCCRegOperand) && (Inst.getOperand(0).getReg() != Mips::FCC0) && !hasEightFccRegisters()) return Match_NoFCCRegisterForCurrentISA; return Match_Success; } static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands, uint64_t ErrorInfo) { if (ErrorInfo != ~0ULL && ErrorInfo < Operands.size()) { SMLoc ErrorLoc = Operands[ErrorInfo]->getStartLoc(); if (ErrorLoc == SMLoc()) return Loc; return ErrorLoc; } return Loc; } bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { MCInst Inst; unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm); switch (MatchResult) { case Match_Success: if (processInstruction(Inst, IDLoc, Out, STI)) return true; return false; case Match_MissingFeature: Error(IDLoc, "instruction requires a CPU feature not currently enabled"); return true; case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); ErrorLoc = Operands[ErrorInfo]->getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } return Error(ErrorLoc, "invalid operand for instruction"); } case Match_NonZeroOperandForSync: return Error(IDLoc, "s-type must be zero or unspecified for pre-MIPS32 ISAs"); case Match_NonZeroOperandForMTCX: return Error(IDLoc, "selector must be zero for pre-MIPS32 ISAs"); case Match_MnemonicFail: return Error(IDLoc, "invalid instruction"); case Match_RequiresDifferentSrcAndDst: return Error(IDLoc, "source and destination must be different"); case Match_RequiresDifferentOperands: return Error(IDLoc, "registers must be different"); case Match_RequiresNoZeroRegister: return Error(IDLoc, "invalid operand ($zero) for instruction"); case Match_RequiresSameSrcAndDst: return Error(IDLoc, "source and destination must match"); case Match_NoFCCRegisterForCurrentISA: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "non-zero fcc register doesn't exist in current ISA level"); case Match_Immz: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'"); case Match_UImm1_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 1-bit unsigned immediate"); case Match_UImm2_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 2-bit unsigned immediate"); case Match_UImm2_1: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range 1 .. 4"); case Match_UImm3_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 3-bit unsigned immediate"); case Match_UImm4_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 4-bit unsigned immediate"); case Match_SImm4_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 4-bit signed immediate"); case Match_UImm5_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 5-bit unsigned immediate"); case Match_SImm5_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 5-bit signed immediate"); case Match_UImm5_1: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range 1 .. 32"); case Match_UImm5_32: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range 32 .. 63"); case Match_UImm5_33: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range 33 .. 64"); case Match_UImm5_0_Report_UImm6: // This is used on UImm5 operands that have a corresponding UImm5_32 // operand to avoid confusing the user. return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 6-bit unsigned immediate"); case Match_UImm5_Lsl2: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected both 7-bit unsigned immediate and multiple of 4"); case Match_UImmRange2_64: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range 2 .. 64"); case Match_UImm6_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 6-bit unsigned immediate"); case Match_UImm6_Lsl2: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected both 8-bit unsigned immediate and multiple of 4"); case Match_SImm6_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 6-bit signed immediate"); case Match_UImm7_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 7-bit unsigned immediate"); case Match_UImm7_N1: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range -1 .. 126"); case Match_SImm7_Lsl2: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected both 9-bit signed immediate and multiple of 4"); case Match_UImm8_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 8-bit unsigned immediate"); case Match_UImm10_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 10-bit unsigned immediate"); case Match_SImm10_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 10-bit signed immediate"); case Match_SImm11_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 11-bit signed immediate"); case Match_UImm16: case Match_UImm16_Relaxed: case Match_UImm16_AltRelaxed: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 16-bit unsigned immediate"); case Match_SImm16: case Match_SImm16_Relaxed: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 16-bit signed immediate"); case Match_SImm19_Lsl2: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected both 19-bit signed immediate and multiple of 4"); case Match_UImm20_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 20-bit unsigned immediate"); case Match_UImm26_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 26-bit unsigned immediate"); case Match_SImm32: case Match_SImm32_Relaxed: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 32-bit signed immediate"); case Match_UImm32_Coerced: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 32-bit immediate"); case Match_MemSImm9: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 9-bit signed offset"); case Match_MemSImm10: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 10-bit signed offset"); case Match_MemSImm10Lsl1: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 11-bit signed offset and multiple of 2"); case Match_MemSImm10Lsl2: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 12-bit signed offset and multiple of 4"); case Match_MemSImm10Lsl3: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 13-bit signed offset and multiple of 8"); case Match_MemSImm11: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 11-bit signed offset"); case Match_MemSImm12: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 12-bit signed offset"); case Match_MemSImm16: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 16-bit signed offset"); case Match_MemSImmPtr: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected memory with 32-bit signed offset"); case Match_RequiresPosSizeRange0_32: { SMLoc ErrorStart = Operands[3]->getStartLoc(); SMLoc ErrorEnd = Operands[4]->getEndLoc(); return Error(ErrorStart, "size plus position are not in the range 0 .. 32", SMRange(ErrorStart, ErrorEnd)); } case Match_RequiresPosSizeUImm6: { SMLoc ErrorStart = Operands[3]->getStartLoc(); SMLoc ErrorEnd = Operands[4]->getEndLoc(); return Error(ErrorStart, "size plus position are not in the range 1 .. 63", SMRange(ErrorStart, ErrorEnd)); } case Match_RequiresPosSizeRange33_64: { SMLoc ErrorStart = Operands[3]->getStartLoc(); SMLoc ErrorEnd = Operands[4]->getEndLoc(); return Error(ErrorStart, "size plus position are not in the range 33 .. 64", SMRange(ErrorStart, ErrorEnd)); } } llvm_unreachable("Implement any new match types added!"); } void MipsAsmParser::warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc) { if (RegIndex != 0 && AssemblerOptions.back()->getATRegIndex() == RegIndex) Warning(Loc, "used $at (currently $" + Twine(RegIndex) + ") without \".set noat\""); } void MipsAsmParser::warnIfNoMacro(SMLoc Loc) { if (!AssemblerOptions.back()->isMacro()) Warning(Loc, "macro instruction expanded into multiple instructions"); } void MipsAsmParser::ConvertXWPOperands(MCInst &Inst, const OperandVector &Operands) { assert( (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM) && "Unexpected instruction!"); ((MipsOperand &)*Operands[1]).addGPR32ZeroAsmRegOperands(Inst, 1); int NextReg = nextReg(((MipsOperand &)*Operands[1]).getGPR32Reg()); Inst.addOperand(MCOperand::createReg(NextReg)); ((MipsOperand &)*Operands[2]).addMemOperands(Inst, 2); } void MipsAsmParser::printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg, SMRange Range, bool ShowColors) { getSourceManager().PrintMessage(Range.Start, SourceMgr::DK_Warning, Msg, Range, SMFixIt(Range, FixMsg), ShowColors); } int MipsAsmParser::matchCPURegisterName(StringRef Name) { int CC; CC = StringSwitch(Name) .Case("zero", 0) .Cases("at", "AT", 1) .Case("a0", 4) .Case("a1", 5) .Case("a2", 6) .Case("a3", 7) .Case("v0", 2) .Case("v1", 3) .Case("s0", 16) .Case("s1", 17) .Case("s2", 18) .Case("s3", 19) .Case("s4", 20) .Case("s5", 21) .Case("s6", 22) .Case("s7", 23) .Case("k0", 26) .Case("k1", 27) .Case("gp", 28) .Case("sp", 29) .Case("fp", 30) .Case("s8", 30) .Case("ra", 31) .Case("t0", 8) .Case("t1", 9) .Case("t2", 10) .Case("t3", 11) .Case("t4", 12) .Case("t5", 13) .Case("t6", 14) .Case("t7", 15) .Case("t8", 24) .Case("t9", 25) .Default(-1); if (!(isABI_N32() || isABI_N64())) return CC; if (12 <= CC && CC <= 15) { // Name is one of t4-t7 AsmToken RegTok = getLexer().peekTok(); SMRange RegRange = RegTok.getLocRange(); StringRef FixedName = StringSwitch(Name) .Case("t4", "t0") .Case("t5", "t1") .Case("t6", "t2") .Case("t7", "t3") .Default(""); assert(FixedName != "" && "Register name is not one of t4-t7."); printWarningWithFixIt("register names $t4-$t7 are only available in O32.", "Did you mean $" + FixedName + "?", RegRange); } // Although SGI documentation just cuts out t0-t3 for n32/n64, // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7 // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7. if (8 <= CC && CC <= 11) CC += 4; if (CC == -1) CC = StringSwitch(Name) .Case("a4", 8) .Case("a5", 9) .Case("a6", 10) .Case("a7", 11) .Case("kt0", 26) .Case("kt1", 27) .Default(-1); return CC; } int MipsAsmParser::matchHWRegsRegisterName(StringRef Name) { int CC; CC = StringSwitch(Name) .Case("hwr_cpunum", 0) .Case("hwr_synci_step", 1) .Case("hwr_cc", 2) .Case("hwr_ccres", 3) .Case("hwr_ulr", 29) .Default(-1); return CC; } int MipsAsmParser::matchFPURegisterName(StringRef Name) { if (Name[0] == 'f') { StringRef NumString = Name.substr(1); unsigned IntVal; if (NumString.getAsInteger(10, IntVal)) return -1; // This is not an integer. if (IntVal > 31) // Maximum index for fpu register. return -1; return IntVal; } return -1; } int MipsAsmParser::matchFCCRegisterName(StringRef Name) { if (Name.startswith("fcc")) { StringRef NumString = Name.substr(3); unsigned IntVal; if (NumString.getAsInteger(10, IntVal)) return -1; // This is not an integer. if (IntVal > 7) // There are only 8 fcc registers. return -1; return IntVal; } return -1; } int MipsAsmParser::matchACRegisterName(StringRef Name) { if (Name.startswith("ac")) { StringRef NumString = Name.substr(2); unsigned IntVal; if (NumString.getAsInteger(10, IntVal)) return -1; // This is not an integer. if (IntVal > 3) // There are only 3 acc registers. return -1; return IntVal; } return -1; } int MipsAsmParser::matchMSA128RegisterName(StringRef Name) { unsigned IntVal; if (Name.front() != 'w' || Name.drop_front(1).getAsInteger(10, IntVal)) return -1; if (IntVal > 31) return -1; return IntVal; } int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) { int CC; CC = StringSwitch(Name) .Case("msair", 0) .Case("msacsr", 1) .Case("msaaccess", 2) .Case("msasave", 3) .Case("msamodify", 4) .Case("msarequest", 5) .Case("msamap", 6) .Case("msaunmap", 7) .Default(-1); return CC; } bool MipsAsmParser::canUseATReg() { return AssemblerOptions.back()->getATRegIndex() != 0; } unsigned MipsAsmParser::getATReg(SMLoc Loc) { unsigned ATIndex = AssemblerOptions.back()->getATRegIndex(); if (ATIndex == 0) { reportParseError(Loc, "pseudo-instruction requires $at, which is not available"); return 0; } unsigned AT = getReg( (isGP64bit()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, ATIndex); return AT; } unsigned MipsAsmParser::getReg(int RC, int RegNo) { return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo); } bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { MCAsmParser &Parser = getParser(); LLVM_DEBUG(dbgs() << "parseOperand\n"); // Check if the current operand has a custom associated parser, if so, try to // custom parse the operand, or fallback to the general approach. OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); if (ResTy == MatchOperand_Success) return false; // If there wasn't a custom match, try the generic matcher below. Otherwise, // there was a match, but an error occurred, in which case, just return that // the operand parsing failed. if (ResTy == MatchOperand_ParseFail) return true; LLVM_DEBUG(dbgs() << ".. Generic Parser\n"); switch (getLexer().getKind()) { case AsmToken::Dollar: { // Parse the register. SMLoc S = Parser.getTok().getLoc(); // Almost all registers have been parsed by custom parsers. There is only // one exception to this. $zero (and it's alias $0) will reach this point // for div, divu, and similar instructions because it is not an operand // to the instruction definition but an explicit register. Special case // this situation for now. if (parseAnyRegister(Operands) != MatchOperand_NoMatch) return false; // Maybe it is a symbol reference. StringRef Identifier; if (Parser.parseIdentifier(Identifier)) return true; SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); MCSymbol *Sym = getContext().getOrCreateSymbol("$" + Identifier); // Otherwise create a symbol reference. const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); Operands.push_back(MipsOperand::CreateImm(Res, S, E, *this)); return false; } default: { LLVM_DEBUG(dbgs() << ".. generic integer expression\n"); const MCExpr *Expr; SMLoc S = Parser.getTok().getLoc(); // Start location of the operand. if (getParser().parseExpression(Expr)) return true; SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm(Expr, S, E, *this)); return false; } } // switch(getLexer().getKind()) return true; } bool MipsAsmParser::isEvaluated(const MCExpr *Expr) { switch (Expr->getKind()) { case MCExpr::Constant: return true; case MCExpr::SymbolRef: return (cast(Expr)->getKind() != MCSymbolRefExpr::VK_None); case MCExpr::Binary: { const MCBinaryExpr *BE = cast(Expr); if (!isEvaluated(BE->getLHS())) return false; return isEvaluated(BE->getRHS()); } case MCExpr::Unary: return isEvaluated(cast(Expr)->getSubExpr()); case MCExpr::Target: return true; } return false; } bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { SmallVector, 1> Operands; OperandMatchResultTy ResTy = parseAnyRegister(Operands); if (ResTy == MatchOperand_Success) { assert(Operands.size() == 1); MipsOperand &Operand = static_cast(*Operands.front()); StartLoc = Operand.getStartLoc(); EndLoc = Operand.getEndLoc(); // AFAIK, we only support numeric registers and named GPR's in CFI // directives. // Don't worry about eating tokens before failing. Using an unrecognised // register is a parse error. if (Operand.isGPRAsmReg()) { // Resolve to GPR32 or GPR64 appropriately. RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg(); } return (RegNo == (unsigned)-1); } assert(Operands.size() == 0); return (RegNo == (unsigned)-1); } bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) { SMLoc S; if (isParenExpr) return getParser().parseParenExprOfDepth(0, Res, S); return getParser().parseExpression(Res); } OperandMatchResultTy MipsAsmParser::parseMemOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); LLVM_DEBUG(dbgs() << "parseMemOperand\n"); const MCExpr *IdVal = nullptr; SMLoc S; bool isParenExpr = false; OperandMatchResultTy Res = MatchOperand_NoMatch; // First operand is the offset. S = Parser.getTok().getLoc(); if (getLexer().getKind() == AsmToken::LParen) { Parser.Lex(); isParenExpr = true; } if (getLexer().getKind() != AsmToken::Dollar) { if (parseMemOffset(IdVal, isParenExpr)) return MatchOperand_ParseFail; const AsmToken &Tok = Parser.getTok(); // Get the next token. if (Tok.isNot(AsmToken::LParen)) { MipsOperand &Mnemonic = static_cast(*Operands[0]); if (Mnemonic.getToken() == "la" || Mnemonic.getToken() == "dla") { SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this)); return MatchOperand_Success; } if (Tok.is(AsmToken::EndOfStatement)) { SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); // Zero register assumed, add a memory operand with ZERO as its base. // "Base" will be managed by k_Memory. auto Base = MipsOperand::createGPRReg( 0, "0", getContext().getRegisterInfo(), S, E, *this); Operands.push_back( MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this)); return MatchOperand_Success; } MCBinaryExpr::Opcode Opcode; // GAS and LLVM treat comparison operators different. GAS will generate -1 // or 0, while LLVM will generate 0 or 1. Since a comparsion operator is // highly unlikely to be found in a memory offset expression, we don't // handle them. switch (Tok.getKind()) { case AsmToken::Plus: Opcode = MCBinaryExpr::Add; Parser.Lex(); break; case AsmToken::Minus: Opcode = MCBinaryExpr::Sub; Parser.Lex(); break; case AsmToken::Star: Opcode = MCBinaryExpr::Mul; Parser.Lex(); break; case AsmToken::Pipe: Opcode = MCBinaryExpr::Or; Parser.Lex(); break; case AsmToken::Amp: Opcode = MCBinaryExpr::And; Parser.Lex(); break; case AsmToken::LessLess: Opcode = MCBinaryExpr::Shl; Parser.Lex(); break; case AsmToken::GreaterGreater: Opcode = MCBinaryExpr::LShr; Parser.Lex(); break; case AsmToken::Caret: Opcode = MCBinaryExpr::Xor; Parser.Lex(); break; case AsmToken::Slash: Opcode = MCBinaryExpr::Div; Parser.Lex(); break; case AsmToken::Percent: Opcode = MCBinaryExpr::Mod; Parser.Lex(); break; default: Error(Parser.getTok().getLoc(), "'(' or expression expected"); return MatchOperand_ParseFail; } const MCExpr * NextExpr; if (getParser().parseExpression(NextExpr)) return MatchOperand_ParseFail; IdVal = MCBinaryExpr::create(Opcode, IdVal, NextExpr, getContext()); } Parser.Lex(); // Eat the '(' token. } Res = parseAnyRegister(Operands); if (Res != MatchOperand_Success) return Res; if (Parser.getTok().isNot(AsmToken::RParen)) { Error(Parser.getTok().getLoc(), "')' expected"); return MatchOperand_ParseFail; } SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Parser.Lex(); // Eat the ')' token. if (!IdVal) IdVal = MCConstantExpr::create(0, getContext()); // Replace the register operand with the memory operand. std::unique_ptr op( static_cast(Operands.back().release())); // Remove the register from the operands. // "op" will be managed by k_Memory. Operands.pop_back(); // Add the memory operand. if (const MCBinaryExpr *BE = dyn_cast(IdVal)) { int64_t Imm; if (IdVal->evaluateAsAbsolute(Imm)) IdVal = MCConstantExpr::create(Imm, getContext()); else if (BE->getLHS()->getKind() != MCExpr::SymbolRef) IdVal = MCBinaryExpr::create(BE->getOpcode(), BE->getRHS(), BE->getLHS(), getContext()); } Operands.push_back(MipsOperand::CreateMem(std::move(op), IdVal, S, E, *this)); return MatchOperand_Success; } bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) { MCAsmParser &Parser = getParser(); MCSymbol *Sym = getContext().lookupSymbol(Parser.getTok().getIdentifier()); if (!Sym) return false; SMLoc S = Parser.getTok().getLoc(); if (Sym->isVariable()) { const MCExpr *Expr = Sym->getVariableValue(); if (Expr->getKind() == MCExpr::SymbolRef) { const MCSymbolRefExpr *Ref = static_cast(Expr); StringRef DefSymbol = Ref->getSymbol().getName(); if (DefSymbol.startswith("$")) { OperandMatchResultTy ResTy = matchAnyRegisterNameWithoutDollar(Operands, DefSymbol.substr(1), S); if (ResTy == MatchOperand_Success) { Parser.Lex(); return true; } if (ResTy == MatchOperand_ParseFail) llvm_unreachable("Should never ParseFail"); } } } else if (Sym->isUnset()) { // If symbol is unset, it might be created in the `parseSetAssignment` // routine as an alias for a numeric register name. // Lookup in the aliases list. auto Entry = RegisterSets.find(Sym->getName()); if (Entry != RegisterSets.end()) { OperandMatchResultTy ResTy = matchAnyRegisterWithoutDollar(Operands, Entry->getValue(), S); if (ResTy == MatchOperand_Success) { Parser.Lex(); return true; } } } return false; } OperandMatchResultTy MipsAsmParser::matchAnyRegisterNameWithoutDollar(OperandVector &Operands, StringRef Identifier, SMLoc S) { int Index = matchCPURegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::createGPRReg( Index, Identifier, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this)); return MatchOperand_Success; } Index = matchHWRegsRegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::createHWRegsReg( Index, Identifier, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this)); return MatchOperand_Success; } Index = matchFPURegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::createFGRReg( Index, Identifier, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this)); return MatchOperand_Success; } Index = matchFCCRegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::createFCCReg( Index, Identifier, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this)); return MatchOperand_Success; } Index = matchACRegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::createACCReg( Index, Identifier, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this)); return MatchOperand_Success; } Index = matchMSA128RegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::createMSA128Reg( Index, Identifier, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this)); return MatchOperand_Success; } Index = matchMSA128CtrlRegisterName(Identifier); if (Index != -1) { Operands.push_back(MipsOperand::createMSACtrlReg( Index, Identifier, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this)); return MatchOperand_Success; } return MatchOperand_NoMatch; } OperandMatchResultTy MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, const AsmToken &Token, SMLoc S) { if (Token.is(AsmToken::Identifier)) { LLVM_DEBUG(dbgs() << ".. identifier\n"); StringRef Identifier = Token.getIdentifier(); OperandMatchResultTy ResTy = matchAnyRegisterNameWithoutDollar(Operands, Identifier, S); return ResTy; } else if (Token.is(AsmToken::Integer)) { LLVM_DEBUG(dbgs() << ".. integer\n"); int64_t RegNum = Token.getIntVal(); if (RegNum < 0 || RegNum > 31) { // Show the error, but treat invalid register // number as a normal one to continue parsing // and catch other possible errors. Error(getLexer().getLoc(), "invalid register number"); } Operands.push_back(MipsOperand::createNumericReg( RegNum, Token.getString(), getContext().getRegisterInfo(), S, Token.getLoc(), *this)); return MatchOperand_Success; } LLVM_DEBUG(dbgs() << Token.getKind() << "\n"); return MatchOperand_NoMatch; } OperandMatchResultTy MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) { auto Token = getLexer().peekTok(false); return matchAnyRegisterWithoutDollar(Operands, Token, S); } OperandMatchResultTy MipsAsmParser::parseAnyRegister(OperandVector &Operands) { MCAsmParser &Parser = getParser(); LLVM_DEBUG(dbgs() << "parseAnyRegister\n"); auto Token = Parser.getTok(); SMLoc S = Token.getLoc(); if (Token.isNot(AsmToken::Dollar)) { LLVM_DEBUG(dbgs() << ".. !$ -> try sym aliasing\n"); if (Token.is(AsmToken::Identifier)) { if (searchSymbolAlias(Operands)) return MatchOperand_Success; } LLVM_DEBUG(dbgs() << ".. !symalias -> NoMatch\n"); return MatchOperand_NoMatch; } LLVM_DEBUG(dbgs() << ".. $\n"); OperandMatchResultTy ResTy = matchAnyRegisterWithoutDollar(Operands, S); if (ResTy == MatchOperand_Success) { Parser.Lex(); // $ Parser.Lex(); // identifier } return ResTy; } OperandMatchResultTy MipsAsmParser::parseJumpTarget(OperandVector &Operands) { MCAsmParser &Parser = getParser(); LLVM_DEBUG(dbgs() << "parseJumpTarget\n"); SMLoc S = getLexer().getLoc(); // Registers are a valid target and have priority over symbols. OperandMatchResultTy ResTy = parseAnyRegister(Operands); if (ResTy != MatchOperand_NoMatch) return ResTy; // Integers and expressions are acceptable const MCExpr *Expr = nullptr; if (Parser.parseExpression(Expr)) { // We have no way of knowing if a symbol was consumed so we must ParseFail return MatchOperand_ParseFail; } Operands.push_back( MipsOperand::CreateImm(Expr, S, getLexer().getLoc(), *this)); return MatchOperand_Success; } OperandMatchResultTy MipsAsmParser::parseInvNum(OperandVector &Operands) { MCAsmParser &Parser = getParser(); const MCExpr *IdVal; // If the first token is '$' we may have register operand. We have to reject // cases where it is not a register. Complicating the matter is that // register names are not reserved across all ABIs. // Peek past the dollar to see if it's a register name for this ABI. SMLoc S = Parser.getTok().getLoc(); if (Parser.getTok().is(AsmToken::Dollar)) { return matchCPURegisterName(Parser.getLexer().peekTok().getString()) == -1 ? MatchOperand_ParseFail : MatchOperand_NoMatch; } if (getParser().parseExpression(IdVal)) return MatchOperand_ParseFail; const MCConstantExpr *MCE = dyn_cast(IdVal); if (!MCE) return MatchOperand_NoMatch; int64_t Val = MCE->getValue(); SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm( MCConstantExpr::create(0 - Val, getContext()), S, E, *this)); return MatchOperand_Success; } OperandMatchResultTy MipsAsmParser::parseRegisterList(OperandVector &Operands) { MCAsmParser &Parser = getParser(); SmallVector Regs; unsigned RegNo; unsigned PrevReg = Mips::NoRegister; bool RegRange = false; SmallVector, 8> TmpOperands; if (Parser.getTok().isNot(AsmToken::Dollar)) return MatchOperand_ParseFail; SMLoc S = Parser.getTok().getLoc(); while (parseAnyRegister(TmpOperands) == MatchOperand_Success) { SMLoc E = getLexer().getLoc(); MipsOperand &Reg = static_cast(*TmpOperands.back()); RegNo = isGP64bit() ? Reg.getGPR64Reg() : Reg.getGPR32Reg(); if (RegRange) { // Remove last register operand because registers from register range // should be inserted first. if ((isGP64bit() && RegNo == Mips::RA_64) || (!isGP64bit() && RegNo == Mips::RA)) { Regs.push_back(RegNo); } else { unsigned TmpReg = PrevReg + 1; while (TmpReg <= RegNo) { if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) || (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) && isGP64bit())) { Error(E, "invalid register operand"); return MatchOperand_ParseFail; } PrevReg = TmpReg; Regs.push_back(TmpReg++); } } RegRange = false; } else { if ((PrevReg == Mips::NoRegister) && ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) || (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) { Error(E, "$16 or $31 expected"); return MatchOperand_ParseFail; } else if (!(((RegNo == Mips::FP || RegNo == Mips::RA || (RegNo >= Mips::S0 && RegNo <= Mips::S7)) && !isGP64bit()) || ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 || (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) && isGP64bit()))) { Error(E, "invalid register operand"); return MatchOperand_ParseFail; } else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) && ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) || (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 && isGP64bit()))) { Error(E, "consecutive register numbers expected"); return MatchOperand_ParseFail; } Regs.push_back(RegNo); } if (Parser.getTok().is(AsmToken::Minus)) RegRange = true; if (!Parser.getTok().isNot(AsmToken::Minus) && !Parser.getTok().isNot(AsmToken::Comma)) { Error(E, "',' or '-' expected"); return MatchOperand_ParseFail; } Lex(); // Consume comma or minus if (Parser.getTok().isNot(AsmToken::Dollar)) break; PrevReg = RegNo; } SMLoc E = Parser.getTok().getLoc(); Operands.push_back(MipsOperand::CreateRegList(Regs, S, E, *this)); parseMemOperand(Operands); return MatchOperand_Success; } /// Sometimes (i.e. load/stores) the operand may be followed immediately by /// either this. /// ::= '(', register, ')' /// handle it before we iterate so we don't get tripped up by the lack of /// a comma. bool MipsAsmParser::parseParenSuffix(StringRef Name, OperandVector &Operands) { MCAsmParser &Parser = getParser(); if (getLexer().is(AsmToken::LParen)) { Operands.push_back( MipsOperand::CreateToken("(", getLexer().getLoc(), *this)); Parser.Lex(); if (parseOperand(Operands, Name)) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, "unexpected token in argument list"); } if (Parser.getTok().isNot(AsmToken::RParen)) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, "unexpected token, expected ')'"); } Operands.push_back( MipsOperand::CreateToken(")", getLexer().getLoc(), *this)); Parser.Lex(); } return false; } /// Sometimes (i.e. in MSA) the operand may be followed immediately by /// either one of these. /// ::= '[', register, ']' /// ::= '[', integer, ']' /// handle it before we iterate so we don't get tripped up by the lack of /// a comma. bool MipsAsmParser::parseBracketSuffix(StringRef Name, OperandVector &Operands) { MCAsmParser &Parser = getParser(); if (getLexer().is(AsmToken::LBrac)) { Operands.push_back( MipsOperand::CreateToken("[", getLexer().getLoc(), *this)); Parser.Lex(); if (parseOperand(Operands, Name)) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, "unexpected token in argument list"); } if (Parser.getTok().isNot(AsmToken::RBrac)) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, "unexpected token, expected ']'"); } Operands.push_back( MipsOperand::CreateToken("]", getLexer().getLoc(), *this)); Parser.Lex(); } return false; } static std::string MipsMnemonicSpellCheck(StringRef S, const FeatureBitset &FBS, unsigned VariantID = 0); bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { MCAsmParser &Parser = getParser(); LLVM_DEBUG(dbgs() << "ParseInstruction\n"); // We have reached first instruction, module directive are now forbidden. getTargetStreamer().forbidModuleDirective(); // Check if we have valid mnemonic if (!mnemonicIsValid(Name, 0)) { FeatureBitset FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); std::string Suggestion = MipsMnemonicSpellCheck(Name, FBS); return Error(NameLoc, "unknown instruction" + Suggestion); } // First operand in MCInst is instruction mnemonic. Operands.push_back(MipsOperand::CreateToken(Name, NameLoc, *this)); // Read the remaining operands. if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. if (parseOperand(Operands, Name)) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, "unexpected token in argument list"); } if (getLexer().is(AsmToken::LBrac) && parseBracketSuffix(Name, Operands)) return true; // AFAIK, parenthesis suffixes are never on the first operand while (getLexer().is(AsmToken::Comma)) { Parser.Lex(); // Eat the comma. // Parse and remember the operand. if (parseOperand(Operands, Name)) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, "unexpected token in argument list"); } // Parse bracket and parenthesis suffixes before we iterate if (getLexer().is(AsmToken::LBrac)) { if (parseBracketSuffix(Name, Operands)) return true; } else if (getLexer().is(AsmToken::LParen) && parseParenSuffix(Name, Operands)) return true; } } if (getLexer().isNot(AsmToken::EndOfStatement)) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, "unexpected token in argument list"); } Parser.Lex(); // Consume the EndOfStatement. return false; } // FIXME: Given that these have the same name, these should both be // consistent on affecting the Parser. bool MipsAsmParser::reportParseError(Twine ErrorMsg) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, ErrorMsg); } bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) { return Error(Loc, ErrorMsg); } bool MipsAsmParser::parseSetNoAtDirective() { MCAsmParser &Parser = getParser(); // Line should look like: ".set noat". // Set the $at register to $0. AssemblerOptions.back()->setATRegIndex(0); Parser.Lex(); // Eat "noat". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } getTargetStreamer().emitDirectiveSetNoAt(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetAtDirective() { // Line can be: ".set at", which sets $at to $1 // or ".set at=$reg", which sets $at to $reg. MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "at". if (getLexer().is(AsmToken::EndOfStatement)) { // No register was specified, so we set $at to $1. AssemblerOptions.back()->setATRegIndex(1); getTargetStreamer().emitDirectiveSetAt(); Parser.Lex(); // Consume the EndOfStatement. return false; } if (getLexer().isNot(AsmToken::Equal)) { reportParseError("unexpected token, expected equals sign"); return false; } Parser.Lex(); // Eat "=". if (getLexer().isNot(AsmToken::Dollar)) { if (getLexer().is(AsmToken::EndOfStatement)) { reportParseError("no register specified"); return false; } else { reportParseError("unexpected token, expected dollar sign '$'"); return false; } } Parser.Lex(); // Eat "$". // Find out what "reg" is. unsigned AtRegNo; const AsmToken &Reg = Parser.getTok(); if (Reg.is(AsmToken::Identifier)) { AtRegNo = matchCPURegisterName(Reg.getIdentifier()); } else if (Reg.is(AsmToken::Integer)) { AtRegNo = Reg.getIntVal(); } else { reportParseError("unexpected token, expected identifier or integer"); return false; } // Check if $reg is a valid register. If it is, set $at to $reg. if (!AssemblerOptions.back()->setATRegIndex(AtRegNo)) { reportParseError("invalid register"); return false; } Parser.Lex(); // Eat "reg". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } getTargetStreamer().emitDirectiveSetAtWithArg(AtRegNo); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetReorderDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } AssemblerOptions.back()->setReorder(); getTargetStreamer().emitDirectiveSetReorder(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetNoReorderDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } AssemblerOptions.back()->setNoReorder(); getTargetStreamer().emitDirectiveSetNoReorder(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetMacroDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } AssemblerOptions.back()->setMacro(); getTargetStreamer().emitDirectiveSetMacro(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetNoMacroDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } if (AssemblerOptions.back()->isReorder()) { reportParseError("`noreorder' must be set before `nomacro'"); return false; } AssemblerOptions.back()->setNoMacro(); getTargetStreamer().emitDirectiveSetNoMacro(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetMsaDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token, expected end of statement"); setFeatureBits(Mips::FeatureMSA, "msa"); getTargetStreamer().emitDirectiveSetMsa(); return false; } bool MipsAsmParser::parseSetNoMsaDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token, expected end of statement"); clearFeatureBits(Mips::FeatureMSA, "msa"); getTargetStreamer().emitDirectiveSetNoMsa(); return false; } bool MipsAsmParser::parseSetNoDspDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "nodsp". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } clearFeatureBits(Mips::FeatureDSP, "dsp"); getTargetStreamer().emitDirectiveSetNoDsp(); return false; } bool MipsAsmParser::parseSetMips16Directive() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "mips16". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } setFeatureBits(Mips::FeatureMips16, "mips16"); getTargetStreamer().emitDirectiveSetMips16(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetNoMips16Directive() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "nomips16". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } clearFeatureBits(Mips::FeatureMips16, "mips16"); getTargetStreamer().emitDirectiveSetNoMips16(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetFpDirective() { MCAsmParser &Parser = getParser(); MipsABIFlagsSection::FpABIKind FpAbiVal; // Line can be: .set fp=32 // .set fp=xx // .set fp=64 Parser.Lex(); // Eat fp token AsmToken Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Equal)) { reportParseError("unexpected token, expected equals sign '='"); return false; } Parser.Lex(); // Eat '=' token. Tok = Parser.getTok(); if (!parseFpABIValue(FpAbiVal, ".set")) return false; if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } getTargetStreamer().emitDirectiveSetFp(FpAbiVal); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetOddSPRegDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "oddspreg". if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg"); getTargetStreamer().emitDirectiveSetOddSPReg(); return false; } bool MipsAsmParser::parseSetNoOddSPRegDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "nooddspreg". if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg"); getTargetStreamer().emitDirectiveSetNoOddSPReg(); return false; } bool MipsAsmParser::parseSetMtDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "mt". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } setFeatureBits(Mips::FeatureMT, "mt"); getTargetStreamer().emitDirectiveSetMt(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetNoMtDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "nomt". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } clearFeatureBits(Mips::FeatureMT, "mt"); getTargetStreamer().emitDirectiveSetNoMt(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetNoCRCDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "nocrc". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } clearFeatureBits(Mips::FeatureCRC, "crc"); getTargetStreamer().emitDirectiveSetNoCRC(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetNoVirtDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "novirt". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } clearFeatureBits(Mips::FeatureVirt, "virt"); getTargetStreamer().emitDirectiveSetNoVirt(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetNoGINVDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat "noginv". // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } clearFeatureBits(Mips::FeatureGINV, "ginv"); getTargetStreamer().emitDirectiveSetNoGINV(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseSetPopDirective() { MCAsmParser &Parser = getParser(); SMLoc Loc = getLexer().getLoc(); Parser.Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token, expected end of statement"); // Always keep an element on the options "stack" to prevent the user // from changing the initial options. This is how we remember them. if (AssemblerOptions.size() == 2) return reportParseError(Loc, ".set pop with no .set push"); MCSubtargetInfo &STI = copySTI(); AssemblerOptions.pop_back(); setAvailableFeatures( ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures())); STI.setFeatureBits(AssemblerOptions.back()->getFeatures()); getTargetStreamer().emitDirectiveSetPop(); return false; } bool MipsAsmParser::parseSetPushDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token, expected end of statement"); // Create a copy of the current assembler options environment and push it. AssemblerOptions.push_back( llvm::make_unique(AssemblerOptions.back().get())); getTargetStreamer().emitDirectiveSetPush(); return false; } bool MipsAsmParser::parseSetSoftFloatDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token, expected end of statement"); setFeatureBits(Mips::FeatureSoftFloat, "soft-float"); getTargetStreamer().emitDirectiveSetSoftFloat(); return false; } bool MipsAsmParser::parseSetHardFloatDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token, expected end of statement"); clearFeatureBits(Mips::FeatureSoftFloat, "soft-float"); getTargetStreamer().emitDirectiveSetHardFloat(); return false; } bool MipsAsmParser::parseSetAssignment() { StringRef Name; MCAsmParser &Parser = getParser(); if (Parser.parseIdentifier(Name)) return reportParseError("expected identifier after .set"); if (getLexer().isNot(AsmToken::Comma)) return reportParseError("unexpected token, expected comma"); Lex(); // Eat comma if (getLexer().is(AsmToken::Dollar) && getLexer().peekTok().is(AsmToken::Integer)) { // Parse assignment of a numeric register: // .set r1,$1 Parser.Lex(); // Eat $. RegisterSets[Name] = Parser.getTok(); Parser.Lex(); // Eat identifier. getContext().getOrCreateSymbol(Name); return false; } MCSymbol *Sym; const MCExpr *Value; if (MCParserUtils::parseAssignmentExpression(Name, /* allow_redef */ true, Parser, Sym, Value)) return true; Sym->setVariableValue(Value); return false; } bool MipsAsmParser::parseSetMips0Directive() { MCAsmParser &Parser = getParser(); Parser.Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token, expected end of statement"); // Reset assembler options to their initial values. MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures())); STI.setFeatureBits(AssemblerOptions.front()->getFeatures()); AssemblerOptions.back()->setFeatures(AssemblerOptions.front()->getFeatures()); getTargetStreamer().emitDirectiveSetMips0(); return false; } bool MipsAsmParser::parseSetArchDirective() { MCAsmParser &Parser = getParser(); Parser.Lex(); if (getLexer().isNot(AsmToken::Equal)) return reportParseError("unexpected token, expected equals sign"); Parser.Lex(); StringRef Arch; if (Parser.parseIdentifier(Arch)) return reportParseError("expected arch identifier"); StringRef ArchFeatureName = StringSwitch(Arch) .Case("mips1", "mips1") .Case("mips2", "mips2") .Case("mips3", "mips3") .Case("mips4", "mips4") .Case("mips5", "mips5") .Case("mips32", "mips32") .Case("mips32r2", "mips32r2") .Case("mips32r3", "mips32r3") .Case("mips32r5", "mips32r5") .Case("mips32r6", "mips32r6") .Case("mips64", "mips64") .Case("mips64r2", "mips64r2") .Case("mips64r3", "mips64r3") .Case("mips64r5", "mips64r5") .Case("mips64r6", "mips64r6") .Case("octeon", "cnmips") .Case("r4000", "mips3") // This is an implementation of Mips3. .Default(""); if (ArchFeatureName.empty()) return reportParseError("unsupported architecture"); if (ArchFeatureName == "mips64r6" && inMicroMipsMode()) return reportParseError("mips64r6 does not support microMIPS"); selectArch(ArchFeatureName); getTargetStreamer().emitDirectiveSetArch(Arch); return false; } bool MipsAsmParser::parseSetFeature(uint64_t Feature) { MCAsmParser &Parser = getParser(); Parser.Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token, expected end of statement"); switch (Feature) { default: llvm_unreachable("Unimplemented feature"); case Mips::FeatureDSP: setFeatureBits(Mips::FeatureDSP, "dsp"); getTargetStreamer().emitDirectiveSetDsp(); break; case Mips::FeatureDSPR2: setFeatureBits(Mips::FeatureDSPR2, "dspr2"); getTargetStreamer().emitDirectiveSetDspr2(); break; case Mips::FeatureMicroMips: setFeatureBits(Mips::FeatureMicroMips, "micromips"); getTargetStreamer().emitDirectiveSetMicroMips(); break; case Mips::FeatureMips1: selectArch("mips1"); getTargetStreamer().emitDirectiveSetMips1(); break; case Mips::FeatureMips2: selectArch("mips2"); getTargetStreamer().emitDirectiveSetMips2(); break; case Mips::FeatureMips3: selectArch("mips3"); getTargetStreamer().emitDirectiveSetMips3(); break; case Mips::FeatureMips4: selectArch("mips4"); getTargetStreamer().emitDirectiveSetMips4(); break; case Mips::FeatureMips5: selectArch("mips5"); getTargetStreamer().emitDirectiveSetMips5(); break; case Mips::FeatureMips32: selectArch("mips32"); getTargetStreamer().emitDirectiveSetMips32(); break; case Mips::FeatureMips32r2: selectArch("mips32r2"); getTargetStreamer().emitDirectiveSetMips32R2(); break; case Mips::FeatureMips32r3: selectArch("mips32r3"); getTargetStreamer().emitDirectiveSetMips32R3(); break; case Mips::FeatureMips32r5: selectArch("mips32r5"); getTargetStreamer().emitDirectiveSetMips32R5(); break; case Mips::FeatureMips32r6: selectArch("mips32r6"); getTargetStreamer().emitDirectiveSetMips32R6(); break; case Mips::FeatureMips64: selectArch("mips64"); getTargetStreamer().emitDirectiveSetMips64(); break; case Mips::FeatureMips64r2: selectArch("mips64r2"); getTargetStreamer().emitDirectiveSetMips64R2(); break; case Mips::FeatureMips64r3: selectArch("mips64r3"); getTargetStreamer().emitDirectiveSetMips64R3(); break; case Mips::FeatureMips64r5: selectArch("mips64r5"); getTargetStreamer().emitDirectiveSetMips64R5(); break; case Mips::FeatureMips64r6: selectArch("mips64r6"); getTargetStreamer().emitDirectiveSetMips64R6(); break; case Mips::FeatureCRC: setFeatureBits(Mips::FeatureCRC, "crc"); getTargetStreamer().emitDirectiveSetCRC(); break; case Mips::FeatureVirt: setFeatureBits(Mips::FeatureVirt, "virt"); getTargetStreamer().emitDirectiveSetVirt(); break; case Mips::FeatureGINV: setFeatureBits(Mips::FeatureGINV, "ginv"); getTargetStreamer().emitDirectiveSetGINV(); break; } return false; } bool MipsAsmParser::eatComma(StringRef ErrorStr) { MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::Comma)) { SMLoc Loc = getLexer().getLoc(); return Error(Loc, ErrorStr); } Parser.Lex(); // Eat the comma. return true; } // Used to determine if .cpload, .cprestore, and .cpsetup have any effect. // In this class, it is only used for .cprestore. // FIXME: Only keep track of IsPicEnabled in one place, instead of in both // MipsTargetELFStreamer and MipsAsmParser. bool MipsAsmParser::isPicAndNotNxxAbi() { return inPicMode() && !(isABI_N32() || isABI_N64()); } bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { if (AssemblerOptions.back()->isReorder()) Warning(Loc, ".cpload should be inside a noreorder section"); if (inMips16Mode()) { reportParseError(".cpload is not supported in Mips16 mode"); return false; } SmallVector, 1> Reg; OperandMatchResultTy ResTy = parseAnyRegister(Reg); if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { reportParseError("expected register containing function address"); return false; } MipsOperand &RegOpnd = static_cast(*Reg[0]); if (!RegOpnd.isGPRAsmReg()) { reportParseError(RegOpnd.getStartLoc(), "invalid register"); return false; } // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } getTargetStreamer().emitDirectiveCpLoad(RegOpnd.getGPR32Reg()); return false; } bool MipsAsmParser::parseDirectiveCpLocal(SMLoc Loc) { if (!isABI_N32() && !isABI_N64()) { reportParseError(".cplocal is allowed only in N32 or N64 mode"); return false; } SmallVector, 1> Reg; OperandMatchResultTy ResTy = parseAnyRegister(Reg); if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { reportParseError("expected register containing global pointer"); return false; } MipsOperand &RegOpnd = static_cast(*Reg[0]); if (!RegOpnd.isGPRAsmReg()) { reportParseError(RegOpnd.getStartLoc(), "invalid register"); return false; } // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } getParser().Lex(); // Consume the EndOfStatement. unsigned NewReg = RegOpnd.getGPR32Reg(); if (IsPicEnabled) GPReg = NewReg; getTargetStreamer().emitDirectiveCpLocal(NewReg); return false; } bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) { MCAsmParser &Parser = getParser(); // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it // is used in non-PIC mode. if (inMips16Mode()) { reportParseError(".cprestore is not supported in Mips16 mode"); return false; } // Get the stack offset value. const MCExpr *StackOffset; int64_t StackOffsetVal; if (Parser.parseExpression(StackOffset)) { reportParseError("expected stack offset value"); return false; } if (!StackOffset->evaluateAsAbsolute(StackOffsetVal)) { reportParseError("stack offset is not an absolute expression"); return false; } if (StackOffsetVal < 0) { Warning(Loc, ".cprestore with negative stack offset has no effect"); IsCpRestoreSet = false; } else { IsCpRestoreSet = true; CpRestoreOffset = StackOffsetVal; } // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } if (!getTargetStreamer().emitDirectiveCpRestore( CpRestoreOffset, [&]() { return getATReg(Loc); }, Loc, STI)) return true; Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseDirectiveCPSetup() { MCAsmParser &Parser = getParser(); unsigned FuncReg; unsigned Save; bool SaveIsReg = true; SmallVector, 1> TmpReg; OperandMatchResultTy ResTy = parseAnyRegister(TmpReg); if (ResTy == MatchOperand_NoMatch) { reportParseError("expected register containing function address"); return false; } MipsOperand &FuncRegOpnd = static_cast(*TmpReg[0]); if (!FuncRegOpnd.isGPRAsmReg()) { reportParseError(FuncRegOpnd.getStartLoc(), "invalid register"); return false; } FuncReg = FuncRegOpnd.getGPR32Reg(); TmpReg.clear(); if (!eatComma("unexpected token, expected comma")) return true; ResTy = parseAnyRegister(TmpReg); if (ResTy == MatchOperand_NoMatch) { const MCExpr *OffsetExpr; int64_t OffsetVal; SMLoc ExprLoc = getLexer().getLoc(); if (Parser.parseExpression(OffsetExpr) || !OffsetExpr->evaluateAsAbsolute(OffsetVal)) { reportParseError(ExprLoc, "expected save register or stack offset"); return false; } Save = OffsetVal; SaveIsReg = false; } else { MipsOperand &SaveOpnd = static_cast(*TmpReg[0]); if (!SaveOpnd.isGPRAsmReg()) { reportParseError(SaveOpnd.getStartLoc(), "invalid register"); return false; } Save = SaveOpnd.getGPR32Reg(); } if (!eatComma("unexpected token, expected comma")) return true; const MCExpr *Expr; if (Parser.parseExpression(Expr)) { reportParseError("expected expression"); return false; } if (Expr->getKind() != MCExpr::SymbolRef) { reportParseError("expected symbol"); return false; } const MCSymbolRefExpr *Ref = static_cast(Expr); CpSaveLocation = Save; CpSaveLocationIsRegister = SaveIsReg; getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(), SaveIsReg); return false; } bool MipsAsmParser::parseDirectiveCPReturn() { getTargetStreamer().emitDirectiveCpreturn(CpSaveLocation, CpSaveLocationIsRegister); return false; } bool MipsAsmParser::parseDirectiveNaN() { MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::EndOfStatement)) { const AsmToken &Tok = Parser.getTok(); if (Tok.getString() == "2008") { Parser.Lex(); getTargetStreamer().emitDirectiveNaN2008(); return false; } else if (Tok.getString() == "legacy") { Parser.Lex(); getTargetStreamer().emitDirectiveNaNLegacy(); return false; } } // If we don't recognize the option passed to the .nan // directive (e.g. no option or unknown option), emit an error. reportParseError("invalid option in .nan directive"); return false; } bool MipsAsmParser::parseDirectiveSet() { const AsmToken &Tok = getParser().getTok(); StringRef IdVal = Tok.getString(); SMLoc Loc = Tok.getLoc(); if (IdVal == "noat") return parseSetNoAtDirective(); if (IdVal == "at") return parseSetAtDirective(); if (IdVal == "arch") return parseSetArchDirective(); if (IdVal == "bopt") { Warning(Loc, "'bopt' feature is unsupported"); getParser().Lex(); return false; } if (IdVal == "nobopt") { // We're already running in nobopt mode, so nothing to do. getParser().Lex(); return false; } if (IdVal == "fp") return parseSetFpDirective(); if (IdVal == "oddspreg") return parseSetOddSPRegDirective(); if (IdVal == "nooddspreg") return parseSetNoOddSPRegDirective(); if (IdVal == "pop") return parseSetPopDirective(); if (IdVal == "push") return parseSetPushDirective(); if (IdVal == "reorder") return parseSetReorderDirective(); if (IdVal == "noreorder") return parseSetNoReorderDirective(); if (IdVal == "macro") return parseSetMacroDirective(); if (IdVal == "nomacro") return parseSetNoMacroDirective(); if (IdVal == "mips16") return parseSetMips16Directive(); if (IdVal == "nomips16") return parseSetNoMips16Directive(); if (IdVal == "nomicromips") { clearFeatureBits(Mips::FeatureMicroMips, "micromips"); getTargetStreamer().emitDirectiveSetNoMicroMips(); getParser().eatToEndOfStatement(); return false; } if (IdVal == "micromips") { if (hasMips64r6()) { Error(Loc, ".set micromips directive is not supported with MIPS64R6"); return false; } return parseSetFeature(Mips::FeatureMicroMips); } if (IdVal == "mips0") return parseSetMips0Directive(); if (IdVal == "mips1") return parseSetFeature(Mips::FeatureMips1); if (IdVal == "mips2") return parseSetFeature(Mips::FeatureMips2); if (IdVal == "mips3") return parseSetFeature(Mips::FeatureMips3); if (IdVal == "mips4") return parseSetFeature(Mips::FeatureMips4); if (IdVal == "mips5") return parseSetFeature(Mips::FeatureMips5); if (IdVal == "mips32") return parseSetFeature(Mips::FeatureMips32); if (IdVal == "mips32r2") return parseSetFeature(Mips::FeatureMips32r2); if (IdVal == "mips32r3") return parseSetFeature(Mips::FeatureMips32r3); if (IdVal == "mips32r5") return parseSetFeature(Mips::FeatureMips32r5); if (IdVal == "mips32r6") return parseSetFeature(Mips::FeatureMips32r6); if (IdVal == "mips64") return parseSetFeature(Mips::FeatureMips64); if (IdVal == "mips64r2") return parseSetFeature(Mips::FeatureMips64r2); if (IdVal == "mips64r3") return parseSetFeature(Mips::FeatureMips64r3); if (IdVal == "mips64r5") return parseSetFeature(Mips::FeatureMips64r5); if (IdVal == "mips64r6") { if (inMicroMipsMode()) { Error(Loc, "MIPS64R6 is not supported with microMIPS"); return false; } return parseSetFeature(Mips::FeatureMips64r6); } if (IdVal == "dsp") return parseSetFeature(Mips::FeatureDSP); if (IdVal == "dspr2") return parseSetFeature(Mips::FeatureDSPR2); if (IdVal == "nodsp") return parseSetNoDspDirective(); if (IdVal == "msa") return parseSetMsaDirective(); if (IdVal == "nomsa") return parseSetNoMsaDirective(); if (IdVal == "mt") return parseSetMtDirective(); if (IdVal == "nomt") return parseSetNoMtDirective(); if (IdVal == "softfloat") return parseSetSoftFloatDirective(); if (IdVal == "hardfloat") return parseSetHardFloatDirective(); if (IdVal == "crc") return parseSetFeature(Mips::FeatureCRC); if (IdVal == "nocrc") return parseSetNoCRCDirective(); if (IdVal == "virt") return parseSetFeature(Mips::FeatureVirt); if (IdVal == "novirt") return parseSetNoVirtDirective(); if (IdVal == "ginv") return parseSetFeature(Mips::FeatureGINV); if (IdVal == "noginv") return parseSetNoGINVDirective(); // It is just an identifier, look for an assignment. return parseSetAssignment(); } /// parseDirectiveGpWord /// ::= .gpword local_sym bool MipsAsmParser::parseDirectiveGpWord() { MCAsmParser &Parser = getParser(); const MCExpr *Value; // EmitGPRel32Value requires an expression, so we are using base class // method to evaluate the expression. if (getParser().parseExpression(Value)) return true; getParser().getStreamer().EmitGPRel32Value(Value); if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(getLexer().getLoc(), "unexpected token, expected end of statement"); Parser.Lex(); // Eat EndOfStatement token. return false; } /// parseDirectiveGpDWord /// ::= .gpdword local_sym bool MipsAsmParser::parseDirectiveGpDWord() { MCAsmParser &Parser = getParser(); const MCExpr *Value; // EmitGPRel64Value requires an expression, so we are using base class // method to evaluate the expression. if (getParser().parseExpression(Value)) return true; getParser().getStreamer().EmitGPRel64Value(Value); if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(getLexer().getLoc(), "unexpected token, expected end of statement"); Parser.Lex(); // Eat EndOfStatement token. return false; } /// parseDirectiveDtpRelWord /// ::= .dtprelword tls_sym bool MipsAsmParser::parseDirectiveDtpRelWord() { MCAsmParser &Parser = getParser(); const MCExpr *Value; // EmitDTPRel32Value requires an expression, so we are using base class // method to evaluate the expression. if (getParser().parseExpression(Value)) return true; getParser().getStreamer().EmitDTPRel32Value(Value); if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(getLexer().getLoc(), "unexpected token, expected end of statement"); Parser.Lex(); // Eat EndOfStatement token. return false; } /// parseDirectiveDtpRelDWord /// ::= .dtpreldword tls_sym bool MipsAsmParser::parseDirectiveDtpRelDWord() { MCAsmParser &Parser = getParser(); const MCExpr *Value; // EmitDTPRel64Value requires an expression, so we are using base class // method to evaluate the expression. if (getParser().parseExpression(Value)) return true; getParser().getStreamer().EmitDTPRel64Value(Value); if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(getLexer().getLoc(), "unexpected token, expected end of statement"); Parser.Lex(); // Eat EndOfStatement token. return false; } /// parseDirectiveTpRelWord /// ::= .tprelword tls_sym bool MipsAsmParser::parseDirectiveTpRelWord() { MCAsmParser &Parser = getParser(); const MCExpr *Value; // EmitTPRel32Value requires an expression, so we are using base class // method to evaluate the expression. if (getParser().parseExpression(Value)) return true; getParser().getStreamer().EmitTPRel32Value(Value); if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(getLexer().getLoc(), "unexpected token, expected end of statement"); Parser.Lex(); // Eat EndOfStatement token. return false; } /// parseDirectiveTpRelDWord /// ::= .tpreldword tls_sym bool MipsAsmParser::parseDirectiveTpRelDWord() { MCAsmParser &Parser = getParser(); const MCExpr *Value; // EmitTPRel64Value requires an expression, so we are using base class // method to evaluate the expression. if (getParser().parseExpression(Value)) return true; getParser().getStreamer().EmitTPRel64Value(Value); if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(getLexer().getLoc(), "unexpected token, expected end of statement"); Parser.Lex(); // Eat EndOfStatement token. return false; } bool MipsAsmParser::parseDirectiveOption() { MCAsmParser &Parser = getParser(); // Get the option token. AsmToken Tok = Parser.getTok(); // At the moment only identifiers are supported. if (Tok.isNot(AsmToken::Identifier)) { return Error(Parser.getTok().getLoc(), "unexpected token, expected identifier"); } StringRef Option = Tok.getIdentifier(); if (Option == "pic0") { // MipsAsmParser needs to know if the current PIC mode changes. IsPicEnabled = false; getTargetStreamer().emitDirectiveOptionPic0(); Parser.Lex(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { return Error(Parser.getTok().getLoc(), "unexpected token, expected end of statement"); } return false; } if (Option == "pic2") { // MipsAsmParser needs to know if the current PIC mode changes. IsPicEnabled = true; getTargetStreamer().emitDirectiveOptionPic2(); Parser.Lex(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { return Error(Parser.getTok().getLoc(), "unexpected token, expected end of statement"); } return false; } // Unknown option. Warning(Parser.getTok().getLoc(), "unknown option, expected 'pic0' or 'pic2'"); Parser.eatToEndOfStatement(); return false; } /// parseInsnDirective /// ::= .insn bool MipsAsmParser::parseInsnDirective() { // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } // The actual label marking happens in // MipsELFStreamer::createPendingLabelRelocs(). getTargetStreamer().emitDirectiveInsn(); getParser().Lex(); // Eat EndOfStatement token. return false; } /// parseRSectionDirective /// ::= .rdata bool MipsAsmParser::parseRSectionDirective(StringRef Section) { // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } MCSection *ELFSection = getContext().getELFSection( Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC); getParser().getStreamer().SwitchSection(ELFSection); getParser().Lex(); // Eat EndOfStatement token. return false; } /// parseSSectionDirective /// ::= .sbss /// ::= .sdata bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) { // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } MCSection *ELFSection = getContext().getELFSection( Section, Type, ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL); getParser().getStreamer().SwitchSection(ELFSection); getParser().Lex(); // Eat EndOfStatement token. return false; } /// parseDirectiveModule /// ::= .module oddspreg /// ::= .module nooddspreg /// ::= .module fp=value /// ::= .module softfloat /// ::= .module hardfloat /// ::= .module mt /// ::= .module crc /// ::= .module nocrc /// ::= .module virt /// ::= .module novirt /// ::= .module ginv /// ::= .module noginv bool MipsAsmParser::parseDirectiveModule() { MCAsmParser &Parser = getParser(); MCAsmLexer &Lexer = getLexer(); SMLoc L = Lexer.getLoc(); if (!getTargetStreamer().isModuleDirectiveAllowed()) { // TODO : get a better message. reportParseError(".module directive must appear before any code"); return false; } StringRef Option; if (Parser.parseIdentifier(Option)) { reportParseError("expected .module option identifier"); return false; } if (Option == "oddspreg") { clearModuleFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg"); // Synchronize the abiflags information with the FeatureBits information we // changed above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated abiflags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted at the end). getTargetStreamer().emitDirectiveModuleOddSPReg(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "nooddspreg") { if (!isABI_O32()) { return Error(L, "'.module nooddspreg' requires the O32 ABI"); } setModuleFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg"); // Synchronize the abiflags information with the FeatureBits information we // changed above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated abiflags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted at the end). getTargetStreamer().emitDirectiveModuleOddSPReg(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "fp") { return parseDirectiveModuleFP(); } else if (Option == "softfloat") { setModuleFeatureBits(Mips::FeatureSoftFloat, "soft-float"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleSoftFloat(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "hardfloat") { clearModuleFeatureBits(Mips::FeatureSoftFloat, "soft-float"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleHardFloat(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "mt") { setModuleFeatureBits(Mips::FeatureMT, "mt"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleMT(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "crc") { setModuleFeatureBits(Mips::FeatureCRC, "crc"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleCRC(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "nocrc") { clearModuleFeatureBits(Mips::FeatureCRC, "crc"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleNoCRC(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "virt") { setModuleFeatureBits(Mips::FeatureVirt, "virt"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleVirt(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "novirt") { clearModuleFeatureBits(Mips::FeatureVirt, "virt"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleNoVirt(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "ginv") { setModuleFeatureBits(Mips::FeatureGINV, "ginv"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleGINV(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else if (Option == "noginv") { clearModuleFeatureBits(Mips::FeatureGINV, "ginv"); // Synchronize the ABI Flags information with the FeatureBits information we // updated above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated ABI Flags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted later). getTargetStreamer().emitDirectiveModuleNoGINV(); // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } return false; // parseDirectiveModule has finished successfully. } else { return Error(L, "'" + Twine(Option) + "' is not a valid .module option."); } } /// parseDirectiveModuleFP /// ::= =32 /// ::= =xx /// ::= =64 bool MipsAsmParser::parseDirectiveModuleFP() { MCAsmParser &Parser = getParser(); MCAsmLexer &Lexer = getLexer(); if (Lexer.isNot(AsmToken::Equal)) { reportParseError("unexpected token, expected equals sign '='"); return false; } Parser.Lex(); // Eat '=' token. MipsABIFlagsSection::FpABIKind FpABI; if (!parseFpABIValue(FpABI, ".module")) return false; if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } // Synchronize the abiflags information with the FeatureBits information we // changed above. getTargetStreamer().updateABIInfo(*this); // If printing assembly, use the recently updated abiflags information. // If generating ELF, don't do anything (the .MIPS.abiflags section gets // emitted at the end). getTargetStreamer().emitDirectiveModuleFP(); Parser.Lex(); // Consume the EndOfStatement. return false; } bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI, StringRef Directive) { MCAsmParser &Parser = getParser(); MCAsmLexer &Lexer = getLexer(); bool ModuleLevelOptions = Directive == ".module"; if (Lexer.is(AsmToken::Identifier)) { StringRef Value = Parser.getTok().getString(); Parser.Lex(); if (Value != "xx") { reportParseError("unsupported value, expected 'xx', '32' or '64'"); return false; } if (!isABI_O32()) { reportParseError("'" + Directive + " fp=xx' requires the O32 ABI"); return false; } FpABI = MipsABIFlagsSection::FpABIKind::XX; if (ModuleLevelOptions) { setModuleFeatureBits(Mips::FeatureFPXX, "fpxx"); clearModuleFeatureBits(Mips::FeatureFP64Bit, "fp64"); } else { setFeatureBits(Mips::FeatureFPXX, "fpxx"); clearFeatureBits(Mips::FeatureFP64Bit, "fp64"); } return true; } if (Lexer.is(AsmToken::Integer)) { unsigned Value = Parser.getTok().getIntVal(); Parser.Lex(); if (Value != 32 && Value != 64) { reportParseError("unsupported value, expected 'xx', '32' or '64'"); return false; } if (Value == 32) { if (!isABI_O32()) { reportParseError("'" + Directive + " fp=32' requires the O32 ABI"); return false; } FpABI = MipsABIFlagsSection::FpABIKind::S32; if (ModuleLevelOptions) { clearModuleFeatureBits(Mips::FeatureFPXX, "fpxx"); clearModuleFeatureBits(Mips::FeatureFP64Bit, "fp64"); } else { clearFeatureBits(Mips::FeatureFPXX, "fpxx"); clearFeatureBits(Mips::FeatureFP64Bit, "fp64"); } } else { FpABI = MipsABIFlagsSection::FpABIKind::S64; if (ModuleLevelOptions) { clearModuleFeatureBits(Mips::FeatureFPXX, "fpxx"); setModuleFeatureBits(Mips::FeatureFP64Bit, "fp64"); } else { clearFeatureBits(Mips::FeatureFPXX, "fpxx"); setFeatureBits(Mips::FeatureFP64Bit, "fp64"); } } return true; } return false; } bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { // This returns false if this function recognizes the directive // regardless of whether it is successfully handles or reports an // error. Otherwise it returns true to give the generic parser a // chance at recognizing it. MCAsmParser &Parser = getParser(); StringRef IDVal = DirectiveID.getString(); if (IDVal == ".cpload") { parseDirectiveCpLoad(DirectiveID.getLoc()); return false; } if (IDVal == ".cprestore") { parseDirectiveCpRestore(DirectiveID.getLoc()); return false; } if (IDVal == ".cplocal") { parseDirectiveCpLocal(DirectiveID.getLoc()); return false; } if (IDVal == ".ent") { StringRef SymbolName; if (Parser.parseIdentifier(SymbolName)) { reportParseError("expected identifier after .ent"); return false; } // There's an undocumented extension that allows an integer to // follow the name of the procedure which AFAICS is ignored by GAS. // Example: .ent foo,2 if (getLexer().isNot(AsmToken::EndOfStatement)) { if (getLexer().isNot(AsmToken::Comma)) { // Even though we accept this undocumented extension for compatibility // reasons, the additional integer argument does not actually change // the behaviour of the '.ent' directive, so we would like to discourage // its use. We do this by not referring to the extended version in // error messages which are not directly related to its use. reportParseError("unexpected token, expected end of statement"); return false; } Parser.Lex(); // Eat the comma. const MCExpr *DummyNumber; int64_t DummyNumberVal; // If the user was explicitly trying to use the extended version, // we still give helpful extension-related error messages. if (Parser.parseExpression(DummyNumber)) { reportParseError("expected number after comma"); return false; } if (!DummyNumber->evaluateAsAbsolute(DummyNumberVal)) { reportParseError("expected an absolute expression after comma"); return false; } } // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName); getTargetStreamer().emitDirectiveEnt(*Sym); CurrentFn = Sym; IsCpRestoreSet = false; return false; } if (IDVal == ".end") { StringRef SymbolName; if (Parser.parseIdentifier(SymbolName)) { reportParseError("expected identifier after .end"); return false; } if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } if (CurrentFn == nullptr) { reportParseError(".end used without .ent"); return false; } if ((SymbolName != CurrentFn->getName())) { reportParseError(".end symbol does not match .ent symbol"); return false; } getTargetStreamer().emitDirectiveEnd(SymbolName); CurrentFn = nullptr; IsCpRestoreSet = false; return false; } if (IDVal == ".frame") { // .frame $stack_reg, frame_size_in_bytes, $return_reg SmallVector, 1> TmpReg; OperandMatchResultTy ResTy = parseAnyRegister(TmpReg); if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { reportParseError("expected stack register"); return false; } MipsOperand &StackRegOpnd = static_cast(*TmpReg[0]); if (!StackRegOpnd.isGPRAsmReg()) { reportParseError(StackRegOpnd.getStartLoc(), "expected general purpose register"); return false; } unsigned StackReg = StackRegOpnd.getGPR32Reg(); if (Parser.getTok().is(AsmToken::Comma)) Parser.Lex(); else { reportParseError("unexpected token, expected comma"); return false; } // Parse the frame size. const MCExpr *FrameSize; int64_t FrameSizeVal; if (Parser.parseExpression(FrameSize)) { reportParseError("expected frame size value"); return false; } if (!FrameSize->evaluateAsAbsolute(FrameSizeVal)) { reportParseError("frame size not an absolute expression"); return false; } if (Parser.getTok().is(AsmToken::Comma)) Parser.Lex(); else { reportParseError("unexpected token, expected comma"); return false; } // Parse the return register. TmpReg.clear(); ResTy = parseAnyRegister(TmpReg); if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { reportParseError("expected return register"); return false; } MipsOperand &ReturnRegOpnd = static_cast(*TmpReg[0]); if (!ReturnRegOpnd.isGPRAsmReg()) { reportParseError(ReturnRegOpnd.getStartLoc(), "expected general purpose register"); return false; } // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } getTargetStreamer().emitFrame(StackReg, FrameSizeVal, ReturnRegOpnd.getGPR32Reg()); IsCpRestoreSet = false; return false; } if (IDVal == ".set") { parseDirectiveSet(); return false; } if (IDVal == ".mask" || IDVal == ".fmask") { // .mask bitmask, frame_offset // bitmask: One bit for each register used. // frame_offset: Offset from Canonical Frame Address ($sp on entry) where // first register is expected to be saved. // Examples: // .mask 0x80000000, -4 // .fmask 0x80000000, -4 // // Parse the bitmask const MCExpr *BitMask; int64_t BitMaskVal; if (Parser.parseExpression(BitMask)) { reportParseError("expected bitmask value"); return false; } if (!BitMask->evaluateAsAbsolute(BitMaskVal)) { reportParseError("bitmask not an absolute expression"); return false; } if (Parser.getTok().is(AsmToken::Comma)) Parser.Lex(); else { reportParseError("unexpected token, expected comma"); return false; } // Parse the frame_offset const MCExpr *FrameOffset; int64_t FrameOffsetVal; if (Parser.parseExpression(FrameOffset)) { reportParseError("expected frame offset value"); return false; } if (!FrameOffset->evaluateAsAbsolute(FrameOffsetVal)) { reportParseError("frame offset not an absolute expression"); return false; } // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } if (IDVal == ".mask") getTargetStreamer().emitMask(BitMaskVal, FrameOffsetVal); else getTargetStreamer().emitFMask(BitMaskVal, FrameOffsetVal); return false; } if (IDVal == ".nan") return parseDirectiveNaN(); if (IDVal == ".gpword") { parseDirectiveGpWord(); return false; } if (IDVal == ".gpdword") { parseDirectiveGpDWord(); return false; } if (IDVal == ".dtprelword") { parseDirectiveDtpRelWord(); return false; } if (IDVal == ".dtpreldword") { parseDirectiveDtpRelDWord(); return false; } if (IDVal == ".tprelword") { parseDirectiveTpRelWord(); return false; } if (IDVal == ".tpreldword") { parseDirectiveTpRelDWord(); return false; } if (IDVal == ".option") { parseDirectiveOption(); return false; } if (IDVal == ".abicalls") { getTargetStreamer().emitDirectiveAbiCalls(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { Error(Parser.getTok().getLoc(), "unexpected token, expected end of statement"); } return false; } if (IDVal == ".cpsetup") { parseDirectiveCPSetup(); return false; } if (IDVal == ".cpreturn") { parseDirectiveCPReturn(); return false; } if (IDVal == ".module") { parseDirectiveModule(); return false; } if (IDVal == ".llvm_internal_mips_reallow_module_directive") { parseInternalDirectiveReallowModule(); return false; } if (IDVal == ".insn") { parseInsnDirective(); return false; } if (IDVal == ".rdata") { parseRSectionDirective(".rodata"); return false; } if (IDVal == ".sbss") { parseSSectionDirective(IDVal, ELF::SHT_NOBITS); return false; } if (IDVal == ".sdata") { parseSSectionDirective(IDVal, ELF::SHT_PROGBITS); return false; } return true; } bool MipsAsmParser::parseInternalDirectiveReallowModule() { // If this is not the end of the statement, report an error. if (getLexer().isNot(AsmToken::EndOfStatement)) { reportParseError("unexpected token, expected end of statement"); return false; } getTargetStreamer().reallowModuleDirective(); getParser().Lex(); // Eat EndOfStatement token. return false; } extern "C" void LLVMInitializeMipsAsmParser() { RegisterMCAsmParser X(getTheMipsTarget()); RegisterMCAsmParser Y(getTheMipselTarget()); RegisterMCAsmParser A(getTheMips64Target()); RegisterMCAsmParser B(getTheMips64elTarget()); } #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION #define GET_MNEMONIC_SPELL_CHECKER #include "MipsGenAsmMatcher.inc" bool MipsAsmParser::mnemonicIsValid(StringRef Mnemonic, unsigned VariantID) { // Find the appropriate table for this asm variant. const MatchEntry *Start, *End; switch (VariantID) { default: llvm_unreachable("invalid variant!"); case 0: Start = std::begin(MatchTable0); End = std::end(MatchTable0); break; } // Search the table. auto MnemonicRange = std::equal_range(Start, End, Mnemonic, LessOpcode()); return MnemonicRange.first != MnemonicRange.second; } Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelDAGToDAG.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelDAGToDAG.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelDAGToDAG.cpp (revision 351709) @@ -1,289 +1,292 @@ //===-- RISCVISelDAGToDAG.cpp - A dag to dag inst selector for RISCV ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines an instruction selector for the RISCV target. // //===----------------------------------------------------------------------===// #include "MCTargetDesc/RISCVMCTargetDesc.h" #include "RISCV.h" #include "RISCVTargetMachine.h" #include "Utils/RISCVMatInt.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "riscv-isel" // RISCV-specific code to select RISCV machine instructions for // SelectionDAG operations. namespace { class RISCVDAGToDAGISel final : public SelectionDAGISel { const RISCVSubtarget *Subtarget; public: explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine) : SelectionDAGISel(TargetMachine) {} StringRef getPassName() const override { return "RISCV DAG->DAG Pattern Instruction Selection"; } bool runOnMachineFunction(MachineFunction &MF) override { Subtarget = &MF.getSubtarget(); return SelectionDAGISel::runOnMachineFunction(MF); } void PostprocessISelDAG() override; void Select(SDNode *Node) override; bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) override; bool SelectAddrFI(SDValue Addr, SDValue &Base); // Include the pieces autogenerated from the target description. #include "RISCVGenDAGISel.inc" private: void doPeepholeLoadStoreADDI(); }; } void RISCVDAGToDAGISel::PostprocessISelDAG() { doPeepholeLoadStoreADDI(); } static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm, MVT XLenVT) { RISCVMatInt::InstSeq Seq; RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq); SDNode *Result; SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT); for (RISCVMatInt::Inst &Inst : Seq) { SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT); if (Inst.Opc == RISCV::LUI) Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm); else Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm); // Only the first instruction has X0 as its source. SrcReg = SDValue(Result, 0); } return Result; } // Returns true if the Node is an ISD::AND with a constant argument. If so, // set Mask to that constant value. static bool isConstantMask(SDNode *Node, uint64_t &Mask) { if (Node->getOpcode() == ISD::AND && Node->getOperand(1).getOpcode() == ISD::Constant) { Mask = cast(Node->getOperand(1))->getZExtValue(); return true; } return false; } void RISCVDAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we have already selected. if (Node->isMachineOpcode()) { LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n"); Node->setNodeId(-1); return; } // Instruction Selection not handled by the auto-generated tablegen selection // should be handled here. unsigned Opcode = Node->getOpcode(); MVT XLenVT = Subtarget->getXLenVT(); SDLoc DL(Node); EVT VT = Node->getValueType(0); switch (Opcode) { case ISD::Constant: { auto ConstNode = cast(Node); if (VT == XLenVT && ConstNode->isNullValue()) { SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), RISCV::X0, XLenVT); ReplaceNode(Node, New.getNode()); return; } int64_t Imm = ConstNode->getSExtValue(); if (XLenVT == MVT::i64) { ReplaceNode(Node, selectImm(CurDAG, SDLoc(Node), Imm, XLenVT)); return; } break; } case ISD::FrameIndex: { SDValue Imm = CurDAG->getTargetConstant(0, DL, XLenVT); int FI = cast(Node)->getIndex(); SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT); ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm)); return; } case ISD::SRL: { if (!Subtarget->is64Bit()) break; SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); uint64_t Mask; // Match (srl (and val, mask), imm) where the result would be a // zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result // is equivalent to this (SimplifyDemandedBits may have removed lower bits // from the mask that aren't necessary due to the right-shifting). if (Op1.getOpcode() == ISD::Constant && isConstantMask(Op0.getNode(), Mask)) { uint64_t ShAmt = cast(Op1.getNode())->getZExtValue(); if ((Mask | maskTrailingOnes(ShAmt)) == 0xffffffff) { SDValue ShAmtVal = CurDAG->getTargetConstant(ShAmt, SDLoc(Node), XLenVT); CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0.getOperand(0), ShAmtVal); return; } } break; } case RISCVISD::READ_CYCLE_WIDE: assert(!Subtarget->is64Bit() && "READ_CYCLE_WIDE is only used on riscv32"); ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ReadCycleWide, DL, MVT::i32, MVT::i32, MVT::Other, Node->getOperand(0))); return; } // Select the default instruction. SelectCode(Node); } bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand( const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) { switch (ConstraintID) { case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: // We just support simple memory operands that have a single address // operand and need no special handling. OutOps.push_back(Op); return false; + case InlineAsm::Constraint_A: + OutOps.push_back(Op); + return false; default: break; } return true; } bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) { if (auto FIN = dyn_cast(Addr)) { Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT()); return true; } return false; } // Merge an ADDI into the offset of a load/store instruction where possible. // (load (add base, off), 0) -> (load base, off) // (store val, (add base, off)) -> (store val, base, off) void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() { SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); ++Position; while (Position != CurDAG->allnodes_begin()) { SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. if (N->use_empty() || !N->isMachineOpcode()) continue; int OffsetOpIdx; int BaseOpIdx; // Only attempt this optimisation for I-type loads and S-type stores. switch (N->getMachineOpcode()) { default: continue; case RISCV::LB: case RISCV::LH: case RISCV::LW: case RISCV::LBU: case RISCV::LHU: case RISCV::LWU: case RISCV::LD: case RISCV::FLW: case RISCV::FLD: BaseOpIdx = 0; OffsetOpIdx = 1; break; case RISCV::SB: case RISCV::SH: case RISCV::SW: case RISCV::SD: case RISCV::FSW: case RISCV::FSD: BaseOpIdx = 1; OffsetOpIdx = 2; break; } // Currently, the load/store offset must be 0 to be considered for this // peephole optimisation. if (!isa(N->getOperand(OffsetOpIdx)) || N->getConstantOperandVal(OffsetOpIdx) != 0) continue; SDValue Base = N->getOperand(BaseOpIdx); // If the base is an ADDI, we can merge it in to the load/store. if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI) continue; SDValue ImmOperand = Base.getOperand(1); if (auto Const = dyn_cast(ImmOperand)) { ImmOperand = CurDAG->getTargetConstant( Const->getSExtValue(), SDLoc(ImmOperand), ImmOperand.getValueType()); } else if (auto GA = dyn_cast(ImmOperand)) { ImmOperand = CurDAG->getTargetGlobalAddress( GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(), GA->getOffset(), GA->getTargetFlags()); } else { continue; } LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: "); LLVM_DEBUG(Base->dump(CurDAG)); LLVM_DEBUG(dbgs() << "\nN: "); LLVM_DEBUG(N->dump(CurDAG)); LLVM_DEBUG(dbgs() << "\n"); // Modify the offset operand of the load/store. if (BaseOpIdx == 0) // Load CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand, N->getOperand(2)); else // Store CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0), ImmOperand, N->getOperand(3)); // The add-immediate may now be dead, in which case remove it. if (Base.getNode()->use_empty()) CurDAG->RemoveDeadNode(Base.getNode()); } } // This pass converts a legalized DAG into a RISCV-specific DAG, ready // for instruction scheduling. FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM) { return new RISCVDAGToDAGISel(TM); } Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp (revision 351709) @@ -1,2648 +1,2665 @@ //===-- RISCVISelLowering.cpp - RISCV DAG Lowering Implementation --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that RISCV uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "RISCVISelLowering.h" #include "RISCV.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVRegisterInfo.h" #include "RISCVSubtarget.h" #include "RISCVTargetMachine.h" #include "Utils/RISCVMatInt.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "riscv-lower" STATISTIC(NumTailCalls, "Number of tail calls"); RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { if (Subtarget.isRV32E()) report_fatal_error("Codegen not yet implemented for RV32E"); RISCVABI::ABI ABI = Subtarget.getTargetABI(); assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI"); switch (ABI) { default: report_fatal_error("Don't know how to lower this ABI"); case RISCVABI::ABI_ILP32: case RISCVABI::ABI_ILP32F: case RISCVABI::ABI_ILP32D: case RISCVABI::ABI_LP64: case RISCVABI::ABI_LP64F: case RISCVABI::ABI_LP64D: break; } MVT XLenVT = Subtarget.getXLenVT(); // Set up the register classes. addRegisterClass(XLenVT, &RISCV::GPRRegClass); if (Subtarget.hasStdExtF()) addRegisterClass(MVT::f32, &RISCV::FPR32RegClass); if (Subtarget.hasStdExtD()) addRegisterClass(MVT::f64, &RISCV::FPR64RegClass); // Compute derived properties from the register classes. computeRegisterProperties(STI.getRegisterInfo()); setStackPointerRegisterToSaveRestore(RISCV::X2); for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) setLoadExtAction(N, XLenVT, MVT::i1, Promote); // TODO: add all necessary setOperationAction calls. setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BR_CC, XLenVT, Expand); setOperationAction(ISD::SELECT, XLenVT, Custom); setOperationAction(ISD::SELECT_CC, XLenVT, Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Expand); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); for (auto VT : {MVT::i1, MVT::i8, MVT::i16}) setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (Subtarget.is64Bit()) { setOperationAction(ISD::SHL, MVT::i32, Custom); setOperationAction(ISD::SRA, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i32, Custom); } if (!Subtarget.hasStdExtM()) { setOperationAction(ISD::MUL, XLenVT, Expand); setOperationAction(ISD::MULHS, XLenVT, Expand); setOperationAction(ISD::MULHU, XLenVT, Expand); setOperationAction(ISD::SDIV, XLenVT, Expand); setOperationAction(ISD::UDIV, XLenVT, Expand); setOperationAction(ISD::SREM, XLenVT, Expand); setOperationAction(ISD::UREM, XLenVT, Expand); } if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) { setOperationAction(ISD::SDIV, MVT::i32, Custom); setOperationAction(ISD::UDIV, MVT::i32, Custom); setOperationAction(ISD::UREM, MVT::i32, Custom); } setOperationAction(ISD::SDIVREM, XLenVT, Expand); setOperationAction(ISD::UDIVREM, XLenVT, Expand); setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand); setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand); setOperationAction(ISD::SHL_PARTS, XLenVT, Custom); setOperationAction(ISD::SRL_PARTS, XLenVT, Custom); setOperationAction(ISD::SRA_PARTS, XLenVT, Custom); setOperationAction(ISD::ROTL, XLenVT, Expand); setOperationAction(ISD::ROTR, XLenVT, Expand); setOperationAction(ISD::BSWAP, XLenVT, Expand); setOperationAction(ISD::CTTZ, XLenVT, Expand); setOperationAction(ISD::CTLZ, XLenVT, Expand); setOperationAction(ISD::CTPOP, XLenVT, Expand); ISD::CondCode FPCCToExtend[] = { ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT, ISD::SETGE, ISD::SETNE}; ISD::NodeType FPOpToExtend[] = { ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM}; if (Subtarget.hasStdExtF()) { setOperationAction(ISD::FMINNUM, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); for (auto CC : FPCCToExtend) setCondCodeAction(CC, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Expand); for (auto Op : FPOpToExtend) setOperationAction(Op, MVT::f32, Expand); } if (Subtarget.hasStdExtF() && Subtarget.is64Bit()) setOperationAction(ISD::BITCAST, MVT::i32, Custom); if (Subtarget.hasStdExtD()) { setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); for (auto CC : FPCCToExtend) setCondCodeAction(CC, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); for (auto Op : FPOpToExtend) setOperationAction(Op, MVT::f64, Expand); } setOperationAction(ISD::GlobalAddress, XLenVT, Custom); setOperationAction(ISD::BlockAddress, XLenVT, Custom); setOperationAction(ISD::ConstantPool, XLenVT, Custom); setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom); // TODO: On M-mode only targets, the cycle[h] CSR may not be present. // Unfortunately this can't be determined just from the ISA naming string. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Subtarget.is64Bit() ? Legal : Custom); if (Subtarget.hasStdExtA()) { setMaxAtomicSizeInBitsSupported(Subtarget.getXLen()); setMinCmpXchgSizeInBits(32); } else { setMaxAtomicSizeInBitsSupported(0); } setBooleanContents(ZeroOrOneBooleanContent); // Function alignments (log2). unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2; setMinFunctionAlignment(FunctionAlignment); setPrefFunctionAlignment(FunctionAlignment); // Effectively disable jump table generation. setMinimumJumpTableEntries(INT_MAX); } EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return getPointerTy(DL); return VT.changeVectorElementTypeToInteger(); } bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { default: return false; case Intrinsic::riscv_masked_atomicrmw_xchg_i32: case Intrinsic::riscv_masked_atomicrmw_add_i32: case Intrinsic::riscv_masked_atomicrmw_sub_i32: case Intrinsic::riscv_masked_atomicrmw_nand_i32: case Intrinsic::riscv_masked_atomicrmw_max_i32: case Intrinsic::riscv_masked_atomicrmw_min_i32: case Intrinsic::riscv_masked_atomicrmw_umax_i32: case Intrinsic::riscv_masked_atomicrmw_umin_i32: case Intrinsic::riscv_masked_cmpxchg_i32: PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 4; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } } bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // No global is ever allowed as a base. if (AM.BaseGV) return false; // Require a 12-bit signed offset. if (!isInt<12>(AM.BaseOffs)) return false; switch (AM.Scale) { case 0: // "r+i" or just "i", depending on HasBaseReg. break; case 1: if (!AM.HasBaseReg) // allow "r+i". break; return false; // disallow "r+r" or "r+r+i". default: return false; } return true; } bool RISCVTargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<12>(Imm); } bool RISCVTargetLowering::isLegalAddImmediate(int64_t Imm) const { return isInt<12>(Imm); } // On RV32, 64-bit integers are split into their high and low parts and held // in two different registers, so the trunc is free since the low register can // just be used. bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) return false; unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); unsigned DestBits = DstTy->getPrimitiveSizeInBits(); return (SrcBits == 64 && DestBits == 32); } bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || !DstVT.isInteger()) return false; unsigned SrcBits = SrcVT.getSizeInBits(); unsigned DestBits = DstVT.getSizeInBits(); return (SrcBits == 64 && DestBits == 32); } bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { // Zexts are free if they can be combined with a load. if (auto *LD = dyn_cast(Val)) { EVT MemVT = LD->getMemoryVT(); if ((MemVT == MVT::i8 || MemVT == MVT::i16 || (Subtarget.is64Bit() && MemVT == MVT::i32)) && (LD->getExtensionType() == ISD::NON_EXTLOAD || LD->getExtensionType() == ISD::ZEXTLOAD)) return true; } return TargetLowering::isZExtFree(Val, VT2); } bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const { return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64; } bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const { return (VT == MVT::f32 && Subtarget.hasStdExtF()) || (VT == MVT::f64 && Subtarget.hasStdExtD()); } // Changes the condition code and swaps operands if necessary, so the SetCC // operation matches one of the comparisons supported directly in the RISC-V // ISA. static void normaliseSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) { switch (CC) { default: break; case ISD::SETGT: case ISD::SETLE: case ISD::SETUGT: case ISD::SETULE: CC = ISD::getSetCCSwappedOperands(CC); std::swap(LHS, RHS); break; } } // Return the RISC-V branch opcode that matches the given DAG integer // condition code. The CondCode must be one of those supported by the RISC-V // ISA (see normaliseSetCC). static unsigned getBranchOpcodeForIntCondCode(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unsupported CondCode"); case ISD::SETEQ: return RISCV::BEQ; case ISD::SETNE: return RISCV::BNE; case ISD::SETLT: return RISCV::BLT; case ISD::SETGE: return RISCV::BGE; case ISD::SETULT: return RISCV::BLTU; case ISD::SETUGE: return RISCV::BGEU; } } SDValue RISCVTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: report_fatal_error("unimplemented operand"); case ISD::GlobalAddress: return lowerGlobalAddress(Op, DAG); case ISD::BlockAddress: return lowerBlockAddress(Op, DAG); case ISD::ConstantPool: return lowerConstantPool(Op, DAG); case ISD::GlobalTLSAddress: return lowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return lowerSELECT(Op, DAG); case ISD::VASTART: return lowerVASTART(Op, DAG); case ISD::FRAMEADDR: return lowerFRAMEADDR(Op, DAG); case ISD::RETURNADDR: return lowerRETURNADDR(Op, DAG); case ISD::SHL_PARTS: return lowerShiftLeftParts(Op, DAG); case ISD::SRA_PARTS: return lowerShiftRightParts(Op, DAG, true); case ISD::SRL_PARTS: return lowerShiftRightParts(Op, DAG, false); case ISD::BITCAST: { assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() && "Unexpected custom legalisation"); SDLoc DL(Op); SDValue Op0 = Op.getOperand(0); if (Op.getValueType() != MVT::f32 || Op0.getValueType() != MVT::i32) return SDValue(); SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0); return FPConv; } } } static SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty, SelectionDAG &DAG, unsigned Flags) { return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags); } static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty, SelectionDAG &DAG, unsigned Flags) { return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(), Flags); } static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty, SelectionDAG &DAG, unsigned Flags) { return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), N->getOffset(), Flags); } template SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal) const { SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); if (isPositionIndependent()) { SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); if (IsLocal) // Use PC-relative addressing to access the symbol. This generates the // pattern (PseudoLLA sym), which expands to (addi (auipc %pcrel_hi(sym)) // %pcrel_lo(auipc)). return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0); // Use PC-relative addressing to access the GOT for this symbol, then load // the address from the GOT. This generates the pattern (PseudoLA sym), // which expands to (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))). return SDValue(DAG.getMachineNode(RISCV::PseudoLA, DL, Ty, Addr), 0); } switch (getTargetMachine().getCodeModel()) { default: report_fatal_error("Unsupported code model for lowering"); case CodeModel::Small: { // Generate a sequence for accessing addresses within the first 2 GiB of // address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)). SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI); SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO); SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0); return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, AddrLo), 0); } case CodeModel::Medium: { // Generate a sequence for accessing addresses within any 2GiB range within // the address space. This generates the pattern (PseudoLLA sym), which // expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)). SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0); } } } SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT Ty = Op.getValueType(); GlobalAddressSDNode *N = cast(Op); int64_t Offset = N->getOffset(); MVT XLenVT = Subtarget.getXLenVT(); const GlobalValue *GV = N->getGlobal(); bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); SDValue Addr = getAddr(N, DAG, IsLocal); // In order to maximise the opportunity for common subexpression elimination, // emit a separate ADD node for the global address offset instead of folding // it in the global address node. Later peephole optimisations may choose to // fold it back in when profitable. if (Offset != 0) return DAG.getNode(ISD::ADD, DL, Ty, Addr, DAG.getConstant(Offset, DL, XLenVT)); return Addr; } SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { BlockAddressSDNode *N = cast(Op); return getAddr(N, DAG); } SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *N = cast(Op); return getAddr(N, DAG); } SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, bool UseGOT) const { SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); const GlobalValue *GV = N->getGlobal(); MVT XLenVT = Subtarget.getXLenVT(); if (UseGOT) { // Use PC-relative addressing to access the GOT for this TLS symbol, then // load the address from the GOT and add the thread pointer. This generates // the pattern (PseudoLA_TLS_IE sym), which expands to // (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)). SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0); SDValue Load = SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Addr), 0); // Add the thread pointer. SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT); return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg); } // Generate a sequence for accessing the address relative to the thread // pointer, with the appropriate adjustment for the thread pointer offset. // This generates the pattern // (add (add_tprel (lui %tprel_hi(sym)) tp %tprel_add(sym)) %tprel_lo(sym)) SDValue AddrHi = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_HI); SDValue AddrAdd = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_ADD); SDValue AddrLo = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO); SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0); SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT); SDValue MNAdd = SDValue( DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, MNHi, TPReg, AddrAdd), 0); return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNAdd, AddrLo), 0); } SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const { SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits()); const GlobalValue *GV = N->getGlobal(); // Use a PC-relative addressing mode to access the global dynamic GOT address. // This generates the pattern (PseudoLA_TLS_GD sym), which expands to // (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)). SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0); SDValue Load = SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Addr), 0); // Prepare argument list to generate call. ArgListTy Args; ArgListEntry Entry; Entry.Node = Load; Entry.Ty = CallTy; Args.push_back(Entry); // Setup call to __tls_get_addr. TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL) .setChain(DAG.getEntryNode()) .setLibCallee(CallingConv::C, CallTy, DAG.getExternalSymbol("__tls_get_addr", Ty), std::move(Args)); return LowerCallTo(CLI).first; } SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT Ty = Op.getValueType(); GlobalAddressSDNode *N = cast(Op); int64_t Offset = N->getOffset(); MVT XLenVT = Subtarget.getXLenVT(); // Non-PIC TLS lowering should always use the LocalExec model. TLSModel::Model Model = isPositionIndependent() ? getTargetMachine().getTLSModel(N->getGlobal()) : TLSModel::LocalExec; SDValue Addr; switch (Model) { case TLSModel::LocalExec: Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false); break; case TLSModel::InitialExec: Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true); break; case TLSModel::LocalDynamic: case TLSModel::GeneralDynamic: Addr = getDynamicTLSAddr(N, DAG); break; } // In order to maximise the opportunity for common subexpression elimination, // emit a separate ADD node for the global address offset instead of folding // it in the global address node. Later peephole optimisations may choose to // fold it back in when profitable. if (Offset != 0) return DAG.getNode(ISD::ADD, DL, Ty, Addr, DAG.getConstant(Offset, DL, XLenVT)); return Addr; } SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue CondV = Op.getOperand(0); SDValue TrueV = Op.getOperand(1); SDValue FalseV = Op.getOperand(2); SDLoc DL(Op); MVT XLenVT = Subtarget.getXLenVT(); // If the result type is XLenVT and CondV is the output of a SETCC node // which also operated on XLenVT inputs, then merge the SETCC node into the // lowered RISCVISD::SELECT_CC to take advantage of the integer // compare+branch instructions. i.e.: // (select (setcc lhs, rhs, cc), truev, falsev) // -> (riscvisd::select_cc lhs, rhs, cc, truev, falsev) if (Op.getSimpleValueType() == XLenVT && CondV.getOpcode() == ISD::SETCC && CondV.getOperand(0).getSimpleValueType() == XLenVT) { SDValue LHS = CondV.getOperand(0); SDValue RHS = CondV.getOperand(1); auto CC = cast(CondV.getOperand(2)); ISD::CondCode CCVal = CC->get(); normaliseSetCC(LHS, RHS, CCVal); SDValue TargetCC = DAG.getConstant(CCVal, DL, XLenVT); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV}; return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops); } // Otherwise: // (select condv, truev, falsev) // -> (riscvisd::select_cc condv, zero, setne, truev, falsev) SDValue Zero = DAG.getConstant(0, DL, XLenVT); SDValue SetNE = DAG.getConstant(ISD::SETNE, DL, XLenVT); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = {CondV, Zero, SetNE, TrueV, FalseV}; return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops); } SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); RISCVMachineFunctionInfo *FuncInfo = MF.getInfo(); SDLoc DL(Op); SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), getPointerTy(MF.getDataLayout())); // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); unsigned FrameReg = RI.getFrameRegister(MF); int XLenInBytes = Subtarget.getXLen() / 8; EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); while (Depth--) { int Offset = -(XLenInBytes * 2); SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr, DAG.getIntPtrConstant(Offset, DL)); FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); } return FrameAddr; } SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setReturnAddressIsTaken(true); MVT XLenVT = Subtarget.getXLenVT(); int XLenInBytes = Subtarget.getXLen() / 8; if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { int Off = -XLenInBytes; SDValue FrameAddr = lowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(Off, DL, VT); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); } // Return the value of the return address register, marking it an implicit // live-in. unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT)); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT); } SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Shamt = Op.getOperand(2); EVT VT = Lo.getValueType(); // if Shamt-XLEN < 0: // Shamt < XLEN // Lo = Lo << Shamt // Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt)) // else: // Lo = 0 // Hi = Lo << (Shamt-XLEN) SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT); SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT); SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen); SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt); SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt); SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One); SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, XLenMinus1Shamt); SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt); SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo); SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusXLen); SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT); Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero); Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse); SDValue Parts[2] = {Lo, Hi}; return DAG.getMergeValues(Parts, DL); } SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const { SDLoc DL(Op); SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); SDValue Shamt = Op.getOperand(2); EVT VT = Lo.getValueType(); // SRA expansion: // if Shamt-XLEN < 0: // Shamt < XLEN // Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt)) // Hi = Hi >>s Shamt // else: // Lo = Hi >>s (Shamt-XLEN); // Hi = Hi >>s (XLEN-1) // // SRL expansion: // if Shamt-XLEN < 0: // Shamt < XLEN // Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt)) // Hi = Hi >>u Shamt // else: // Lo = Hi >>u (Shamt-XLEN); // Hi = 0; unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL; SDValue Zero = DAG.getConstant(0, DL, VT); SDValue One = DAG.getConstant(1, DL, VT); SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT); SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT); SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen); SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt); SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt); SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One); SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, XLenMinus1Shamt); SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi); SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt); SDValue LoFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusXLen); SDValue HiFalse = IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, XLenMinus1) : Zero; SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT); Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse); Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse); SDValue Parts[2] = {Lo, Hi}; return DAG.getMergeValues(Parts, DL); } // Returns the opcode of the target-specific SDNode that implements the 32-bit // form of the given Opcode. static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); case ISD::SHL: return RISCVISD::SLLW; case ISD::SRA: return RISCVISD::SRAW; case ISD::SRL: return RISCVISD::SRLW; case ISD::SDIV: return RISCVISD::DIVW; case ISD::UDIV: return RISCVISD::DIVUW; case ISD::UREM: return RISCVISD::REMUW; } } // Converts the given 32-bit operation to a target-specific SelectionDAG node. // Because i32 isn't a legal type for RV64, these operations would otherwise // be promoted to i64, making it difficult to select the SLLW/DIVUW/.../*W // later one because the fact the operation was originally of type i32 is // lost. static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode()); SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0)); SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1)); SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1); // ReplaceNodeResults requires we maintain the same type for the return value. return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes); } void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDLoc DL(N); switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom type legalize this operation!"); case ISD::READCYCLECOUNTER: { assert(!Subtarget.is64Bit() && "READCYCLECOUNTER only has custom type legalization on riscv32"); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue RCW = DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0)); Results.push_back(RCW); Results.push_back(RCW.getValue(1)); Results.push_back(RCW.getValue(2)); break; } case ISD::SHL: case ISD::SRA: case ISD::SRL: assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); if (N->getOperand(1).getOpcode() == ISD::Constant) return; Results.push_back(customLegalizeToWOp(N, DAG)); break; case ISD::SDIV: case ISD::UDIV: case ISD::UREM: assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && Subtarget.hasStdExtM() && "Unexpected custom legalisation"); if (N->getOperand(0).getOpcode() == ISD::Constant || N->getOperand(1).getOpcode() == ISD::Constant) return; Results.push_back(customLegalizeToWOp(N, DAG)); break; case ISD::BITCAST: { assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && Subtarget.hasStdExtF() && "Unexpected custom legalisation"); SDLoc DL(N); SDValue Op0 = N->getOperand(0); if (Op0.getValueType() != MVT::f32) return; SDValue FPConv = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv)); break; } } } SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case RISCVISD::SplitF64: { SDValue Op0 = N->getOperand(0); // If the input to SplitF64 is just BuildPairF64 then the operation is // redundant. Instead, use BuildPairF64's operands directly. if (Op0->getOpcode() == RISCVISD::BuildPairF64) return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1)); SDLoc DL(N); // It's cheaper to materialise two 32-bit integers than to load a double // from the constant pool and transfer it to integer registers through the // stack. if (ConstantFPSDNode *C = dyn_cast(Op0)) { APInt V = C->getValueAPF().bitcastToAPInt(); SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32); SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32); return DCI.CombineTo(N, Lo, Hi); } // This is a target-specific version of a DAGCombine performed in // DAGCombiner::visitBITCAST. It performs the equivalent of: // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) || !Op0.getNode()->hasOneUse()) break; SDValue NewSplitF64 = DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), Op0.getOperand(0)); SDValue Lo = NewSplitF64.getValue(0); SDValue Hi = NewSplitF64.getValue(1); APInt SignBit = APInt::getSignMask(32); if (Op0.getOpcode() == ISD::FNEG) { SDValue NewHi = DAG.getNode(ISD::XOR, DL, MVT::i32, Hi, DAG.getConstant(SignBit, DL, MVT::i32)); return DCI.CombineTo(N, Lo, NewHi); } assert(Op0.getOpcode() == ISD::FABS); SDValue NewHi = DAG.getNode(ISD::AND, DL, MVT::i32, Hi, DAG.getConstant(~SignBit, DL, MVT::i32)); return DCI.CombineTo(N, Lo, NewHi); } case RISCVISD::SLLW: case RISCVISD::SRAW: case RISCVISD::SRLW: { // Only the lower 32 bits of LHS and lower 5 bits of RHS are read. SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32); APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5); if ((SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI)) || (SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI))) return SDValue(); break; } case RISCVISD::FMV_X_ANYEXTW_RV64: { SDLoc DL(N); SDValue Op0 = N->getOperand(0); // If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the // conversion is unnecessary and can be replaced with an ANY_EXTEND // of the FMV_W_X_RV64 operand. if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) { SDValue AExtOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0.getOperand(0)); return DCI.CombineTo(N, AExtOp); } // This is a target-specific version of a DAGCombine performed in // DAGCombiner::visitBITCAST. It performs the equivalent of: // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) || !Op0.getNode()->hasOneUse()) break; SDValue NewFMV = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0.getOperand(0)); APInt SignBit = APInt::getSignMask(32).sext(64); if (Op0.getOpcode() == ISD::FNEG) { return DCI.CombineTo(N, DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV, DAG.getConstant(SignBit, DL, MVT::i64))); } assert(Op0.getOpcode() == ISD::FABS); return DCI.CombineTo(N, DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV, DAG.getConstant(~SignBit, DL, MVT::i64))); } } return SDValue(); } bool RISCVTargetLowering::isDesirableToCommuteWithShift( const SDNode *N, CombineLevel Level) const { // The following folds are only desirable if `(OP _, c1 << c2)` can be // materialised in fewer instructions than `(OP _, c1)`: // // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) SDValue N0 = N->getOperand(0); EVT Ty = N0.getValueType(); if (Ty.isScalarInteger() && (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) { auto *C1 = dyn_cast(N0->getOperand(1)); auto *C2 = dyn_cast(N->getOperand(1)); if (C1 && C2) { APInt C1Int = C1->getAPIntValue(); APInt ShiftedC1Int = C1Int << C2->getAPIntValue(); // We can materialise `c1 << c2` into an add immediate, so it's "free", // and the combine should happen, to potentially allow further combines // later. if (ShiftedC1Int.getMinSignedBits() <= 64 && isLegalAddImmediate(ShiftedC1Int.getSExtValue())) return true; // We can materialise `c1` in an add immediate, so it's "free", and the // combine should be prevented. if (C1Int.getMinSignedBits() <= 64 && isLegalAddImmediate(C1Int.getSExtValue())) return false; // Neither constant will fit into an immediate, so find materialisation // costs. int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(), Subtarget.is64Bit()); int ShiftedC1Cost = RISCVMatInt::getIntMatCost( ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit()); // Materialising `c1` is cheaper than materialising `c1 << c2`, so the // combine should be prevented. if (C1Cost < ShiftedC1Cost) return false; } } return true; } unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; case RISCVISD::SLLW: case RISCVISD::SRAW: case RISCVISD::SRLW: case RISCVISD::DIVW: case RISCVISD::DIVUW: case RISCVISD::REMUW: // TODO: As the result is sign-extended, this is conservatively correct. A // more precise answer could be calculated for SRAW depending on known // bits in the shift amount. return 33; } return 1; } MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI, MachineBasicBlock *BB) { assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction"); // To read the 64-bit cycle CSR on a 32-bit target, we read the two halves. // Should the count have wrapped while it was being read, we need to try // again. // ... // read: // rdcycleh x3 # load high word of cycle // rdcycle x2 # load low word of cycle // rdcycleh x4 # load high word of cycle // bne x3, x4, read # check if high word reads match, otherwise try again // ... MachineFunction &MF = *BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); MF.insert(It, LoopMBB); MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB); MF.insert(It, DoneMBB); // Transfer the remainder of BB and its successor edges to DoneMBB. DoneMBB->splice(DoneMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); DoneMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(LoopMBB); MachineRegisterInfo &RegInfo = MF.getRegInfo(); unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); unsigned LoReg = MI.getOperand(0).getReg(); unsigned HiReg = MI.getOperand(1).getReg(); DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg) .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding) .addReg(RISCV::X0); BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg) .addImm(RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding) .addReg(RISCV::X0); BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg) .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding) .addReg(RISCV::X0); BuildMI(LoopMBB, DL, TII->get(RISCV::BNE)) .addReg(HiReg) .addReg(ReadAgainReg) .addMBB(LoopMBB); LoopMBB->addSuccessor(LoopMBB); LoopMBB->addSuccessor(DoneMBB); MI.eraseFromParent(); return DoneMBB; } static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, MachineBasicBlock *BB) { assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction"); MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); unsigned LoReg = MI.getOperand(0).getReg(); unsigned HiReg = MI.getOperand(1).getReg(); unsigned SrcReg = MI.getOperand(2).getReg(); const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass; int FI = MF.getInfo()->getMoveF64FrameIndex(); TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC, RI); MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, 8, 8); BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg) .addFrameIndex(FI) .addImm(0) .addMemOperand(MMO); BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg) .addFrameIndex(FI) .addImm(4) .addMemOperand(MMO); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, MachineBasicBlock *BB) { assert(MI.getOpcode() == RISCV::BuildPairF64Pseudo && "Unexpected instruction"); MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); unsigned DstReg = MI.getOperand(0).getReg(); unsigned LoReg = MI.getOperand(1).getReg(); unsigned HiReg = MI.getOperand(2).getReg(); const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass; int FI = MF.getInfo()->getMoveF64FrameIndex(); MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, 8, 8); BuildMI(*BB, MI, DL, TII.get(RISCV::SW)) .addReg(LoReg, getKillRegState(MI.getOperand(1).isKill())) .addFrameIndex(FI) .addImm(0) .addMemOperand(MMO); BuildMI(*BB, MI, DL, TII.get(RISCV::SW)) .addReg(HiReg, getKillRegState(MI.getOperand(2).isKill())) .addFrameIndex(FI) .addImm(4) .addMemOperand(MMO); TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } static bool isSelectPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { default: return false; case RISCV::Select_GPR_Using_CC_GPR: case RISCV::Select_FPR32_Using_CC_GPR: case RISCV::Select_FPR64_Using_CC_GPR: return true; } } static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, MachineBasicBlock *BB) { // To "insert" Select_* instructions, we actually have to insert the triangle // control-flow pattern. The incoming instructions know the destination vreg // to set, the condition code register to branch on, the true/false values to // select between, and the condcode to use to select the appropriate branch. // // We produce the following control flow: // HeadMBB // | \ // | IfFalseMBB // | / // TailMBB // // When we find a sequence of selects we attempt to optimize their emission // by sharing the control flow. Currently we only handle cases where we have // multiple selects with the exact same condition (same LHS, RHS and CC). // The selects may be interleaved with other instructions if the other // instructions meet some requirements we deem safe: // - They are debug instructions. Otherwise, // - They do not have side-effects, do not access memory and their inputs do // not depend on the results of the select pseudo-instructions. // The TrueV/FalseV operands of the selects cannot depend on the result of // previous selects in the sequence. // These conditions could be further relaxed. See the X86 target for a // related approach and more information. unsigned LHS = MI.getOperand(1).getReg(); unsigned RHS = MI.getOperand(2).getReg(); auto CC = static_cast(MI.getOperand(3).getImm()); SmallVector SelectDebugValues; SmallSet SelectDests; SelectDests.insert(MI.getOperand(0).getReg()); MachineInstr *LastSelectPseudo = &MI; for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI); SequenceMBBI != E; ++SequenceMBBI) { if (SequenceMBBI->isDebugInstr()) continue; else if (isSelectPseudo(*SequenceMBBI)) { if (SequenceMBBI->getOperand(1).getReg() != LHS || SequenceMBBI->getOperand(2).getReg() != RHS || SequenceMBBI->getOperand(3).getImm() != CC || SelectDests.count(SequenceMBBI->getOperand(4).getReg()) || SelectDests.count(SequenceMBBI->getOperand(5).getReg())) break; LastSelectPseudo = &*SequenceMBBI; SequenceMBBI->collectDebugValues(SelectDebugValues); SelectDests.insert(SequenceMBBI->getOperand(0).getReg()); } else { if (SequenceMBBI->hasUnmodeledSideEffects() || SequenceMBBI->mayLoadOrStore()) break; if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) { return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg()); })) break; } } const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator I = ++BB->getIterator(); MachineBasicBlock *HeadMBB = BB; MachineFunction *F = BB->getParent(); MachineBasicBlock *TailMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *IfFalseMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(I, IfFalseMBB); F->insert(I, TailMBB); // Transfer debug instructions associated with the selects to TailMBB. for (MachineInstr *DebugInstr : SelectDebugValues) { TailMBB->push_back(DebugInstr->removeFromParent()); } // Move all instructions after the sequence to TailMBB. TailMBB->splice(TailMBB->end(), HeadMBB, std::next(LastSelectPseudo->getIterator()), HeadMBB->end()); // Update machine-CFG edges by transferring all successors of the current // block to the new block which will contain the Phi nodes for the selects. TailMBB->transferSuccessorsAndUpdatePHIs(HeadMBB); // Set the successors for HeadMBB. HeadMBB->addSuccessor(IfFalseMBB); HeadMBB->addSuccessor(TailMBB); // Insert appropriate branch. unsigned Opcode = getBranchOpcodeForIntCondCode(CC); BuildMI(HeadMBB, DL, TII.get(Opcode)) .addReg(LHS) .addReg(RHS) .addMBB(TailMBB); // IfFalseMBB just falls through to TailMBB. IfFalseMBB->addSuccessor(TailMBB); // Create PHIs for all of the select pseudo-instructions. auto SelectMBBI = MI.getIterator(); auto SelectEnd = std::next(LastSelectPseudo->getIterator()); auto InsertionPoint = TailMBB->begin(); while (SelectMBBI != SelectEnd) { auto Next = std::next(SelectMBBI); if (isSelectPseudo(*SelectMBBI)) { // %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ] BuildMI(*TailMBB, InsertionPoint, SelectMBBI->getDebugLoc(), TII.get(RISCV::PHI), SelectMBBI->getOperand(0).getReg()) .addReg(SelectMBBI->getOperand(4).getReg()) .addMBB(HeadMBB) .addReg(SelectMBBI->getOperand(5).getReg()) .addMBB(IfFalseMBB); SelectMBBI->eraseFromParent(); } SelectMBBI = Next; } F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs); return TailMBB; } MachineBasicBlock * RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case RISCV::ReadCycleWide: assert(!Subtarget.is64Bit() && "ReadCycleWrite is only to be used on riscv32"); return emitReadCycleWidePseudo(MI, BB); case RISCV::Select_GPR_Using_CC_GPR: case RISCV::Select_FPR32_Using_CC_GPR: case RISCV::Select_FPR64_Using_CC_GPR: return emitSelectPseudo(MI, BB); case RISCV::BuildPairF64Pseudo: return emitBuildPairF64Pseudo(MI, BB); case RISCV::SplitF64Pseudo: return emitSplitF64Pseudo(MI, BB); } } // Calling Convention Implementation. // The expectations for frontend ABI lowering vary from target to target. // Ideally, an LLVM frontend would be able to avoid worrying about many ABI // details, but this is a longer term goal. For now, we simply try to keep the // role of the frontend as simple and well-defined as possible. The rules can // be summarised as: // * Never split up large scalar arguments. We handle them here. // * If a hardfloat calling convention is being used, and the struct may be // passed in a pair of registers (fp+fp, int+fp), and both registers are // available, then pass as two separate arguments. If either the GPRs or FPRs // are exhausted, then pass according to the rule below. // * If a struct could never be passed in registers or directly in a stack // slot (as it is larger than 2*XLEN and the floating point rules don't // apply), then pass it using a pointer with the byval attribute. // * If a struct is less than 2*XLEN, then coerce to either a two-element // word-sized array or a 2*XLEN scalar (depending on alignment). // * The frontend can determine whether a struct is returned by reference or // not based on its size and fields. If it will be returned by reference, the // frontend must modify the prototype so a pointer with the sret annotation is // passed as the first argument. This is not necessary for large scalar // returns. // * Struct return values and varargs should be coerced to structs containing // register-size fields in the same situations they would be for fixed // arguments. static const MCPhysReg ArgGPRs[] = { RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17 }; static const MCPhysReg ArgFPR32s[] = { RISCV::F10_32, RISCV::F11_32, RISCV::F12_32, RISCV::F13_32, RISCV::F14_32, RISCV::F15_32, RISCV::F16_32, RISCV::F17_32 }; static const MCPhysReg ArgFPR64s[] = { RISCV::F10_64, RISCV::F11_64, RISCV::F12_64, RISCV::F13_64, RISCV::F14_64, RISCV::F15_64, RISCV::F16_64, RISCV::F17_64 }; // Pass a 2*XLEN argument that has been split into two XLEN values through // registers or the stack as necessary. static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2, MVT ValVT2, MVT LocVT2, ISD::ArgFlagsTy ArgFlags2) { unsigned XLenInBytes = XLen / 8; if (unsigned Reg = State.AllocateReg(ArgGPRs)) { // At least one half can be passed via register. State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg, VA1.getLocVT(), CCValAssign::Full)); } else { // Both halves must be passed on the stack, with proper alignment. unsigned StackAlign = std::max(XLenInBytes, ArgFlags1.getOrigAlign()); State.addLoc( CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(), State.AllocateStack(XLenInBytes, StackAlign), VA1.getLocVT(), CCValAssign::Full)); State.addLoc(CCValAssign::getMem( ValNo2, ValVT2, State.AllocateStack(XLenInBytes, XLenInBytes), LocVT2, CCValAssign::Full)); return false; } if (unsigned Reg = State.AllocateReg(ArgGPRs)) { // The second half can also be passed via register. State.addLoc( CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full)); } else { // The second half is passed via the stack, without additional alignment. State.addLoc(CCValAssign::getMem( ValNo2, ValVT2, State.AllocateStack(XLenInBytes, XLenInBytes), LocVT2, CCValAssign::Full)); } return false; } // Implements the RISC-V calling convention. Returns true upon failure. static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) { unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); assert(XLen == 32 || XLen == 64); MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; // Any return value split in to more than two values can't be returned // directly. if (IsRet && ValNo > 1) return true; // UseGPRForF32 if targeting one of the soft-float ABIs, if passing a // variadic argument, or if no F32 argument registers are available. bool UseGPRForF32 = true; // UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a // variadic argument, or if no F64 argument registers are available. bool UseGPRForF64 = true; switch (ABI) { default: llvm_unreachable("Unexpected ABI"); case RISCVABI::ABI_ILP32: case RISCVABI::ABI_LP64: break; case RISCVABI::ABI_ILP32F: case RISCVABI::ABI_LP64F: UseGPRForF32 = !IsFixed; break; case RISCVABI::ABI_ILP32D: case RISCVABI::ABI_LP64D: UseGPRForF32 = !IsFixed; UseGPRForF64 = !IsFixed; break; } if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s)) UseGPRForF32 = true; if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s)) UseGPRForF64 = true; // From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local // variables rather than directly checking against the target ABI. if (UseGPRForF32 && ValVT == MVT::f32) { LocVT = XLenVT; LocInfo = CCValAssign::BCvt; } else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) { LocVT = MVT::i64; LocInfo = CCValAssign::BCvt; } // If this is a variadic argument, the RISC-V calling convention requires // that it is assigned an 'even' or 'aligned' register if it has 8-byte // alignment (RV32) or 16-byte alignment (RV64). An aligned register should // be used regardless of whether the original argument was split during // legalisation or not. The argument will not be passed by registers if the // original type is larger than 2*XLEN, so the register alignment rule does // not apply. unsigned TwoXLenInBytes = (2 * XLen) / 8; if (!IsFixed && ArgFlags.getOrigAlign() == TwoXLenInBytes && DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) { unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); // Skip 'odd' register if necessary. if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1) State.AllocateReg(ArgGPRs); } SmallVectorImpl &PendingLocs = State.getPendingLocs(); SmallVectorImpl &PendingArgFlags = State.getPendingArgFlags(); assert(PendingLocs.size() == PendingArgFlags.size() && "PendingLocs and PendingArgFlags out of sync"); // Handle passing f64 on RV32D with a soft float ABI or when floating point // registers are exhausted. if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) { assert(!ArgFlags.isSplit() && PendingLocs.empty() && "Can't lower f64 if it is split"); // Depending on available argument GPRS, f64 may be passed in a pair of // GPRs, split between a GPR and the stack, or passed completely on the // stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these // cases. unsigned Reg = State.AllocateReg(ArgGPRs); LocVT = MVT::i32; if (!Reg) { unsigned StackOffset = State.AllocateStack(8, 8); State.addLoc( CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); return false; } if (!State.AllocateReg(ArgGPRs)) State.AllocateStack(4, 4); State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } // Split arguments might be passed indirectly, so keep track of the pending // values. if (ArgFlags.isSplit() || !PendingLocs.empty()) { LocVT = XLenVT; LocInfo = CCValAssign::Indirect; PendingLocs.push_back( CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); PendingArgFlags.push_back(ArgFlags); if (!ArgFlags.isSplitEnd()) { return false; } } // If the split argument only had two elements, it should be passed directly // in registers or on the stack. if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) { assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()"); // Apply the normal calling convention rules to the first half of the // split argument. CCValAssign VA = PendingLocs[0]; ISD::ArgFlagsTy AF = PendingArgFlags[0]; PendingLocs.clear(); PendingArgFlags.clear(); return CC_RISCVAssign2XLen(XLen, State, VA, AF, ValNo, ValVT, LocVT, ArgFlags); } // Allocate to a register if possible, or else a stack slot. unsigned Reg; if (ValVT == MVT::f32 && !UseGPRForF32) Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s); else if (ValVT == MVT::f64 && !UseGPRForF64) Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s); else Reg = State.AllocateReg(ArgGPRs); unsigned StackOffset = Reg ? 0 : State.AllocateStack(XLen / 8, XLen / 8); // If we reach this point and PendingLocs is non-empty, we must be at the // end of a split argument that must be passed indirectly. if (!PendingLocs.empty()) { assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()"); assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()"); for (auto &It : PendingLocs) { if (Reg) It.convertToReg(Reg); else It.convertToMem(StackOffset); State.addLoc(It); } PendingLocs.clear(); PendingArgFlags.clear(); return false; } assert((!UseGPRForF32 || !UseGPRForF64 || LocVT == XLenVT) && "Expected an XLenVT at this stage"); if (Reg) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } // When an f32 or f64 is passed on the stack, no bit-conversion is needed. if (ValVT == MVT::f32 || ValVT == MVT::f64) { LocVT = ValVT; LocInfo = CCValAssign::Full; } State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); return false; } void RISCVTargetLowering::analyzeInputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet) const { unsigned NumArgs = Ins.size(); FunctionType *FType = MF.getFunction().getFunctionType(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Ins[i].VT; ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; Type *ArgTy = nullptr; if (IsRet) ArgTy = FType->getReturnType(); else if (Ins[i].isOrigArg()) ArgTy = FType->getParamType(Ins[i].getOrigArgIndex()); RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, /*IsRet=*/true, IsRet, ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << '\n'); llvm_unreachable(nullptr); } } } void RISCVTargetLowering::analyzeOutputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Outs, bool IsRet, CallLoweringInfo *CLI) const { unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; i++) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr; RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << "\n"); llvm_unreachable(nullptr); } } } // Convert Val to a ValVT. Should not be called for CCValAssign::Indirect // values. static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, const CCValAssign &VA, const SDLoc &DL) { switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: break; case CCValAssign::BCvt: if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) { Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); break; } Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } return Val; } // The caller is responsible for loading the full value if the argument is // passed with CCValAssign::Indirect. static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain, const CCValAssign &VA, const SDLoc &DL) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); EVT LocVT = VA.getLocVT(); SDValue Val; const TargetRegisterClass *RC; switch (LocVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected register type"); case MVT::i32: case MVT::i64: RC = &RISCV::GPRRegClass; break; case MVT::f32: RC = &RISCV::FPR32RegClass; break; case MVT::f64: RC = &RISCV::FPR64RegClass; break; } unsigned VReg = RegInfo.createVirtualRegister(RC); RegInfo.addLiveIn(VA.getLocReg(), VReg); Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT); if (VA.getLocInfo() == CCValAssign::Indirect) return Val; return convertLocVTToValVT(DAG, Val, VA, DL); } static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, const CCValAssign &VA, const SDLoc &DL) { EVT LocVT = VA.getLocVT(); switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: break; case CCValAssign::BCvt: if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) { Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); break; } Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val); break; } return Val; } // The caller is responsible for loading the full value if the argument is // passed with CCValAssign::Indirect. static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain, const CCValAssign &VA, const SDLoc &DL) { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); EVT LocVT = VA.getLocVT(); EVT ValVT = VA.getValVT(); EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0)); int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, VA.getLocMemOffset(), /*Immutable=*/true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val; ISD::LoadExtType ExtType; switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: case CCValAssign::Indirect: case CCValAssign::BCvt: ExtType = ISD::NON_EXTLOAD; break; } Val = DAG.getExtLoad( ExtType, DL, LocVT, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT); return Val; } static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain, const CCValAssign &VA, const SDLoc &DL) { assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 && "Unexpected VA"); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); if (VA.isMemLoc()) { // f64 is passed on the stack. int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true); SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); return DAG.getLoad(MVT::f64, DL, Chain, FIN, MachinePointerInfo::getFixedStack(MF, FI)); } assert(VA.isRegLoc() && "Expected register VA assignment"); unsigned LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); RegInfo.addLiveIn(VA.getLocReg(), LoVReg); SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32); SDValue Hi; if (VA.getLocReg() == RISCV::X17) { // Second half of f64 is passed on the stack. int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true); SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN, MachinePointerInfo::getFixedStack(MF, FI)); } else { // Second half of f64 is passed in another GPR. unsigned HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass); RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg); Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32); } return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } // Transform physical registers into virtual registers. SDValue RISCVTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { switch (CallConv) { default: report_fatal_error("Unsupported calling convention"); case CallingConv::C: case CallingConv::Fast: break; } MachineFunction &MF = DAG.getMachineFunction(); const Function &Func = MF.getFunction(); if (Func.hasFnAttribute("interrupt")) { if (!Func.arg_empty()) report_fatal_error( "Functions with the interrupt attribute cannot have arguments!"); StringRef Kind = MF.getFunction().getFnAttribute("interrupt").getValueAsString(); if (!(Kind == "user" || Kind == "supervisor" || Kind == "machine")) report_fatal_error( "Function interrupt attribute argument not supported!"); } EVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT XLenVT = Subtarget.getXLenVT(); unsigned XLenInBytes = Subtarget.getXLen() / 8; // Used with vargs to acumulate store chains. std::vector OutChains; // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue ArgValue; // Passing f64 on RV32D with a soft float ABI must be handled as a special // case. if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL); else if (VA.isRegLoc()) ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL); else ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL); if (VA.getLocInfo() == CCValAssign::Indirect) { // If the original argument was split and passed by reference (e.g. i128 // on RV32), we need to load all parts of it here (using the same // address). InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo())); unsigned ArgIndex = Ins[i].OrigArgIndex; assert(Ins[i].PartOffset == 0); while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) { CCValAssign &PartVA = ArgLocs[i + 1]; unsigned PartOffset = Ins[i + 1].PartOffset; SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue, DAG.getIntPtrConstant(PartOffset, DL)); InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address, MachinePointerInfo())); ++i; } continue; } InVals.push_back(ArgValue); } if (IsVarArg) { ArrayRef ArgRegs = makeArrayRef(ArgGPRs); unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs); const TargetRegisterClass *RC = &RISCV::GPRRegClass; MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); RISCVMachineFunctionInfo *RVFI = MF.getInfo(); // Offset of the first variable argument from stack pointer, and size of // the vararg save area. For now, the varargs save area is either zero or // large enough to hold a0-a7. int VaArgOffset, VarArgsSaveSize; // If all registers are allocated, then all varargs must be passed on the // stack and we don't need to save any argregs. if (ArgRegs.size() == Idx) { VaArgOffset = CCInfo.getNextStackOffset(); VarArgsSaveSize = 0; } else { VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx); VaArgOffset = -VarArgsSaveSize; } // Record the frame index of the first variable argument // which is a value necessary to VASTART. int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true); RVFI->setVarArgsFrameIndex(FI); // If saving an odd number of registers then create an extra stack slot to // ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures // offsets to even-numbered registered remain 2*XLEN-aligned. if (Idx % 2) { FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes, true); VarArgsSaveSize += XLenInBytes; } // Copy the integer registers that may have been used for passing varargs // to the vararg save area. for (unsigned I = Idx; I < ArgRegs.size(); ++I, VaArgOffset += XLenInBytes) { const unsigned Reg = RegInfo.createVirtualRegister(RC); RegInfo.addLiveIn(ArgRegs[I], Reg); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT); FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true); SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo::getFixedStack(MF, FI)); cast(Store.getNode()) ->getMemOperand() ->setValue((Value *)nullptr); OutChains.push_back(Store); } RVFI->setVarArgsSaveSize(VarArgsSaveSize); } // All stores are grouped in one node to allow the matching between // the size of Ins and InVals. This only happens for vararg functions. if (!OutChains.empty()) { OutChains.push_back(Chain); Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); } return Chain; } /// isEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. /// Note: This is modelled after ARM's IsEligibleForTailCallOptimization. bool RISCVTargetLowering::isEligibleForTailCallOptimization( CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, const SmallVector &ArgLocs) const { auto &Callee = CLI.Callee; auto CalleeCC = CLI.CallConv; auto IsVarArg = CLI.IsVarArg; auto &Outs = CLI.Outs; auto &Caller = MF.getFunction(); auto CallerCC = Caller.getCallingConv(); // Do not tail call opt functions with "disable-tail-calls" attribute. if (Caller.getFnAttribute("disable-tail-calls").getValueAsString() == "true") return false; // Exception-handling functions need a special set of instructions to // indicate a return to the hardware. Tail-calling another function would // probably break this. // TODO: The "interrupt" attribute isn't currently defined by RISC-V. This // should be expanded as new function attributes are introduced. if (Caller.hasFnAttribute("interrupt")) return false; // Do not tail call opt functions with varargs. if (IsVarArg) return false; // Do not tail call opt if the stack is used to pass parameters. if (CCInfo.getNextStackOffset() != 0) return false; // Do not tail call opt if any parameters need to be passed indirectly. // Since long doubles (fp128) and i128 are larger than 2*XLEN, they are // passed indirectly. So the address of the value will be passed in a // register, or if not available, then the address is put on the stack. In // order to pass indirectly, space on the stack often needs to be allocated // in order to store the value. In this case the CCInfo.getNextStackOffset() // != 0 check is not enough and we need to check if any CCValAssign ArgsLocs // are passed CCValAssign::Indirect. for (auto &VA : ArgLocs) if (VA.getLocInfo() == CCValAssign::Indirect) return false; // Do not tail call opt if either caller or callee uses struct return // semantics. auto IsCallerStructRet = Caller.hasStructRetAttr(); auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet(); if (IsCallerStructRet || IsCalleeStructRet) return false; // Externally-defined functions with weak linkage should not be // tail-called. The behaviour of branch instructions in this situation (as // used for tail calls) is implementation-defined, so we cannot rely on the // linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); if (GV->hasExternalWeakLinkage()) return false; } // The callee has to preserve all registers the caller needs to preserve. const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (CalleeCC != CallerCC) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible // but less efficient and uglier in LowerCall. for (auto &Arg : Outs) if (Arg.Flags.isByVal()) return false; return true; } // Lower a call to a callseq_start + CALL + callseq_end chain, and add input // and output parameter nodes. SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &DL = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; EVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT XLenVT = Subtarget.getXLenVT(); MachineFunction &MF = DAG.getMachineFunction(); // Analyze the operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI); // Check if it's really possible to do a tail call. if (IsTailCall) IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs); if (IsTailCall) ++NumTailCalls; else if (CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = ArgCCInfo.getNextStackOffset(); // Create local copies for byval args SmallVector ByValArgs; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { ISD::ArgFlagsTy Flags = Outs[i].Flags; if (!Flags.isByVal()) continue; SDValue Arg = OutVals[i]; unsigned Size = Flags.getByValSize(); unsigned Align = Flags.getByValAlign(); int FI = MF.getFrameInfo().CreateStackObject(Size, Align, /*isSS=*/false); SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT); Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align, /*IsVolatile=*/false, /*AlwaysInline=*/false, IsTailCall, MachinePointerInfo(), MachinePointerInfo()); ByValArgs.push_back(FIPtr); } if (!IsTailCall) Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); // Copy argument values to their designated locations. SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue ArgValue = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // Handle passing f64 on RV32D with a soft float ABI as a special case. bool IsF64OnRV32DSoftABI = VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64; if (IsF64OnRV32DSoftABI && VA.isRegLoc()) { SDValue SplitF64 = DAG.getNode( RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), ArgValue); SDValue Lo = SplitF64.getValue(0); SDValue Hi = SplitF64.getValue(1); unsigned RegLo = VA.getLocReg(); RegsToPass.push_back(std::make_pair(RegLo, Lo)); if (RegLo == RISCV::X17) { // Second half of f64 is passed on the stack. // Work out the address of the stack slot. if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT); // Emit the store. MemOpChains.push_back( DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo())); } else { // Second half of f64 is passed in another GPR. unsigned RegHigh = RegLo + 1; RegsToPass.push_back(std::make_pair(RegHigh, Hi)); } continue; } // IsF64OnRV32DSoftABI && VA.isMemLoc() is handled below in the same way // as any other MemLoc. // Promote the value if needed. // For now, only handle fully promoted and indirect arguments. if (VA.getLocInfo() == CCValAssign::Indirect) { // Store the argument in a stack slot and pass its address. SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT); int FI = cast(SpillSlot)->getIndex(); MemOpChains.push_back( DAG.getStore(Chain, DL, ArgValue, SpillSlot, MachinePointerInfo::getFixedStack(MF, FI))); // If the original argument was split (e.g. i128), we need // to store all parts of it here (and pass just one address). unsigned ArgIndex = Outs[i].OrigArgIndex; assert(Outs[i].PartOffset == 0); while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) { SDValue PartValue = OutVals[i + 1]; unsigned PartOffset = Outs[i + 1].PartOffset; SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, DAG.getIntPtrConstant(PartOffset, DL)); MemOpChains.push_back( DAG.getStore(Chain, DL, PartValue, Address, MachinePointerInfo::getFixedStack(MF, FI))); ++i; } ArgValue = SpillSlot; } else { ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL); } // Use local copy if it is a byval arg. if (Flags.isByVal()) ArgValue = ByValArgs[j++]; if (VA.isRegLoc()) { // Queue up the argument copies and emit them at the end. RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue)); } else { assert(VA.isMemLoc() && "Argument not register or memory"); assert(!IsTailCall && "Tail call not allowed if stack is used " "for passing parameters"); // Work out the address of the stack slot. if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT); SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, DAG.getIntPtrConstant(VA.getLocMemOffset(), DL)); // Emit the store. MemOpChains.push_back( DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo())); } } // Join the stores, which are independent of one another. if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); SDValue Glue; // Build a sequence of copy-to-reg nodes, chained and glued together. for (auto &Reg : RegsToPass) { Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue); Glue = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node, turn it into a // TargetGlobalAddress/TargetExternalSymbol node so that legalize won't // split it and then direct call can be matched by PseudoCALL. if (GlobalAddressSDNode *S = dyn_cast(Callee)) { const GlobalValue *GV = S->getGlobal(); unsigned OpFlags = RISCVII::MO_CALL; if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) OpFlags = RISCVII::MO_PLT; Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { unsigned OpFlags = RISCVII::MO_CALL; if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(), nullptr)) OpFlags = RISCVII::MO_PLT; Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags); } // The first call operand is the chain and the second is the target address. SmallVector Ops; Ops.push_back(Chain); Ops.push_back(Callee); // Add argument registers to the end of the list so that they are // known live into the call. for (auto &Reg : RegsToPass) Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); if (!IsTailCall) { // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); } // Glue the call to the argument copies, if any. if (Glue.getNode()) Ops.push_back(Glue); // Emit the call. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); return DAG.getNode(RISCVISD::TAIL, DL, NodeTys, Ops); } Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops); Glue = Chain.getValue(1); // Mark the end of the call, which is glued to the call itself. Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, DL, PtrVT, true), DAG.getConstant(0, DL, PtrVT, true), Glue, DL); Glue = Chain.getValue(1); // Assign locations to each value returned by this call. SmallVector RVLocs; CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext()); analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true); // Copy all of the result registers out of their specified physreg. for (auto &VA : RVLocs) { // Copy the value out SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue); // Glue the RetValue to the end of the call sequence Chain = RetValue.getValue(1); Glue = RetValue.getValue(2); if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { assert(VA.getLocReg() == ArgGPRs[0] && "Unexpected reg assignment"); SDValue RetValue2 = DAG.getCopyFromReg(Chain, DL, ArgGPRs[1], MVT::i32, Glue); Chain = RetValue2.getValue(1); Glue = RetValue2.getValue(2); RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue, RetValue2); } RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); InVals.push_back(RetValue); } return Chain; } bool RISCVTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr)) return false; } return true; } SDValue RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { // Stores the assignment of the return value to a location. SmallVector RVLocs; // Info about the registers and stack slot. CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true, nullptr); SDValue Glue; SmallVector RetOps(1, Chain); // Copy the result values into the output registers. for (unsigned i = 0, e = RVLocs.size(); i < e; ++i) { SDValue Val = OutVals[i]; CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) { // Handle returning f64 on RV32D with a soft float ABI. assert(VA.isRegLoc() && "Expected return via registers"); SDValue SplitF64 = DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), Val); SDValue Lo = SplitF64.getValue(0); SDValue Hi = SplitF64.getValue(1); unsigned RegLo = VA.getLocReg(); unsigned RegHi = RegLo + 1; Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(RegLo, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue); Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(RegHi, MVT::i32)); } else { // Handle a 'normal' return. Val = convertValVTToLocVT(DAG, Val, VA, DL); Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue); // Guarantee that all emitted copies are stuck together. Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } } RetOps[0] = Chain; // Update chain. // Add the glue node if we have it. if (Glue.getNode()) { RetOps.push_back(Glue); } // Interrupt service routines use different return instructions. const Function &Func = DAG.getMachineFunction().getFunction(); if (Func.hasFnAttribute("interrupt")) { if (!Func.getReturnType()->isVoidTy()) report_fatal_error( "Functions with the interrupt attribute must have void return type!"); MachineFunction &MF = DAG.getMachineFunction(); StringRef Kind = MF.getFunction().getFnAttribute("interrupt").getValueAsString(); unsigned RetOpc; if (Kind == "user") RetOpc = RISCVISD::URET_FLAG; else if (Kind == "supervisor") RetOpc = RISCVISD::SRET_FLAG; else RetOpc = RISCVISD::MRET_FLAG; return DAG.getNode(RetOpc, DL, MVT::Other, RetOps); } return DAG.getNode(RISCVISD::RET_FLAG, DL, MVT::Other, RetOps); } const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((RISCVISD::NodeType)Opcode) { case RISCVISD::FIRST_NUMBER: break; case RISCVISD::RET_FLAG: return "RISCVISD::RET_FLAG"; case RISCVISD::URET_FLAG: return "RISCVISD::URET_FLAG"; case RISCVISD::SRET_FLAG: return "RISCVISD::SRET_FLAG"; case RISCVISD::MRET_FLAG: return "RISCVISD::MRET_FLAG"; case RISCVISD::CALL: return "RISCVISD::CALL"; case RISCVISD::SELECT_CC: return "RISCVISD::SELECT_CC"; case RISCVISD::BuildPairF64: return "RISCVISD::BuildPairF64"; case RISCVISD::SplitF64: return "RISCVISD::SplitF64"; case RISCVISD::TAIL: return "RISCVISD::TAIL"; case RISCVISD::SLLW: return "RISCVISD::SLLW"; case RISCVISD::SRAW: return "RISCVISD::SRAW"; case RISCVISD::SRLW: return "RISCVISD::SRLW"; case RISCVISD::DIVW: return "RISCVISD::DIVW"; case RISCVISD::DIVUW: return "RISCVISD::DIVUW"; case RISCVISD::REMUW: return "RISCVISD::REMUW"; case RISCVISD::FMV_W_X_RV64: return "RISCVISD::FMV_W_X_RV64"; case RISCVISD::FMV_X_ANYEXTW_RV64: return "RISCVISD::FMV_X_ANYEXTW_RV64"; case RISCVISD::READ_CYCLE_WIDE: return "RISCVISD::READ_CYCLE_WIDE"; } return nullptr; } /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. RISCVTargetLowering::ConstraintType RISCVTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'f': return C_RegisterClass; case 'I': case 'J': case 'K': return C_Immediate; + case 'A': + return C_Memory; } } return TargetLowering::getConstraintType(Constraint); } std::pair RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to a // RISCV register class. if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': return std::make_pair(0U, &RISCV::GPRRegClass); case 'f': if (Subtarget.hasStdExtF() && VT == MVT::f32) return std::make_pair(0U, &RISCV::FPR32RegClass); if (Subtarget.hasStdExtD() && VT == MVT::f64) return std::make_pair(0U, &RISCV::FPR64RegClass); break; default: break; } } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + +unsigned +RISCVTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const { + // Currently only support length 1 constraints. + if (ConstraintCode.size() == 1) { + switch (ConstraintCode[0]) { + case 'A': + return InlineAsm::Constraint_A; + default: + break; + } + } + + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } void RISCVTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { // Currently only support length 1 constraints. if (Constraint.length() == 1) { switch (Constraint[0]) { case 'I': // Validate & create a 12-bit signed immediate operand. if (auto *C = dyn_cast(Op)) { uint64_t CVal = C->getSExtValue(); if (isInt<12>(CVal)) Ops.push_back( DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT())); } return; case 'J': // Validate & create an integer zero operand. if (auto *C = dyn_cast(Op)) if (C->getZExtValue() == 0) Ops.push_back( DAG.getTargetConstant(0, SDLoc(Op), Subtarget.getXLenVT())); return; case 'K': // Validate & create a 5-bit unsigned immediate operand. if (auto *C = dyn_cast(Op)) { uint64_t CVal = C->getZExtValue(); if (isUInt<5>(CVal)) Ops.push_back( DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT())); } return; default: break; } } TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { if (isa(Inst) && Ord == AtomicOrdering::SequentiallyConsistent) return Builder.CreateFence(Ord); if (isa(Inst) && isReleaseOrStronger(Ord)) return Builder.CreateFence(AtomicOrdering::Release); return nullptr; } Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { if (isa(Inst) && isAcquireOrStronger(Ord)) return Builder.CreateFence(AtomicOrdering::Acquire); return nullptr; } TargetLowering::AtomicExpansionKind RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // atomicrmw {fadd,fsub} must be expanded to use compare-exchange, as floating // point operations can't be used in an lr/sc sequence without breaking the // forward-progress guarantee. if (AI->isFloatingPointOperation()) return AtomicExpansionKind::CmpXChg; unsigned Size = AI->getType()->getPrimitiveSizeInBits(); if (Size == 8 || Size == 16) return AtomicExpansionKind::MaskedIntrinsic; return AtomicExpansionKind::None; } static Intrinsic::ID getIntrinsicForMaskedAtomicRMWBinOp(unsigned XLen, AtomicRMWInst::BinOp BinOp) { if (XLen == 32) { switch (BinOp) { default: llvm_unreachable("Unexpected AtomicRMW BinOp"); case AtomicRMWInst::Xchg: return Intrinsic::riscv_masked_atomicrmw_xchg_i32; case AtomicRMWInst::Add: return Intrinsic::riscv_masked_atomicrmw_add_i32; case AtomicRMWInst::Sub: return Intrinsic::riscv_masked_atomicrmw_sub_i32; case AtomicRMWInst::Nand: return Intrinsic::riscv_masked_atomicrmw_nand_i32; case AtomicRMWInst::Max: return Intrinsic::riscv_masked_atomicrmw_max_i32; case AtomicRMWInst::Min: return Intrinsic::riscv_masked_atomicrmw_min_i32; case AtomicRMWInst::UMax: return Intrinsic::riscv_masked_atomicrmw_umax_i32; case AtomicRMWInst::UMin: return Intrinsic::riscv_masked_atomicrmw_umin_i32; } } if (XLen == 64) { switch (BinOp) { default: llvm_unreachable("Unexpected AtomicRMW BinOp"); case AtomicRMWInst::Xchg: return Intrinsic::riscv_masked_atomicrmw_xchg_i64; case AtomicRMWInst::Add: return Intrinsic::riscv_masked_atomicrmw_add_i64; case AtomicRMWInst::Sub: return Intrinsic::riscv_masked_atomicrmw_sub_i64; case AtomicRMWInst::Nand: return Intrinsic::riscv_masked_atomicrmw_nand_i64; case AtomicRMWInst::Max: return Intrinsic::riscv_masked_atomicrmw_max_i64; case AtomicRMWInst::Min: return Intrinsic::riscv_masked_atomicrmw_min_i64; case AtomicRMWInst::UMax: return Intrinsic::riscv_masked_atomicrmw_umax_i64; case AtomicRMWInst::UMin: return Intrinsic::riscv_masked_atomicrmw_umin_i64; } } llvm_unreachable("Unexpected XLen\n"); } Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic( IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr, Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const { unsigned XLen = Subtarget.getXLen(); Value *Ordering = Builder.getIntN(XLen, static_cast(AI->getOrdering())); Type *Tys[] = {AlignedAddr->getType()}; Function *LrwOpScwLoop = Intrinsic::getDeclaration( AI->getModule(), getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys); if (XLen == 64) { Incr = Builder.CreateSExt(Incr, Builder.getInt64Ty()); Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty()); ShiftAmt = Builder.CreateSExt(ShiftAmt, Builder.getInt64Ty()); } Value *Result; // Must pass the shift amount needed to sign extend the loaded value prior // to performing a signed comparison for min/max. ShiftAmt is the number of // bits to shift the value into position. Pass XLen-ShiftAmt-ValWidth, which // is the number of bits to left+right shift the value in order to // sign-extend. if (AI->getOperation() == AtomicRMWInst::Min || AI->getOperation() == AtomicRMWInst::Max) { const DataLayout &DL = AI->getModule()->getDataLayout(); unsigned ValWidth = DL.getTypeStoreSizeInBits(AI->getValOperand()->getType()); Value *SextShamt = Builder.CreateSub(Builder.getIntN(XLen, XLen - ValWidth), ShiftAmt); Result = Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, SextShamt, Ordering}); } else { Result = Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering}); } if (XLen == 64) Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); return Result; } TargetLowering::AtomicExpansionKind RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *CI) const { unsigned Size = CI->getCompareOperand()->getType()->getPrimitiveSizeInBits(); if (Size == 8 || Size == 16) return AtomicExpansionKind::MaskedIntrinsic; return AtomicExpansionKind::None; } Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr, Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const { unsigned XLen = Subtarget.getXLen(); Value *Ordering = Builder.getIntN(XLen, static_cast(Ord)); Intrinsic::ID CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i32; if (XLen == 64) { CmpVal = Builder.CreateSExt(CmpVal, Builder.getInt64Ty()); NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty()); Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty()); CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64; } Type *Tys[] = {AlignedAddr->getType()}; Function *MaskedCmpXchg = Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); Value *Result = Builder.CreateCall( MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); if (XLen == 64) Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); return Result; } unsigned RISCVTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { return RISCV::X10; } unsigned RISCVTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { return RISCV::X11; } Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h =================================================================== --- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h (revision 351709) @@ -1,211 +1,214 @@ //===-- RISCVISelLowering.h - RISCV DAG Lowering Interface ------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that RISCV uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H #define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H #include "RISCV.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" namespace llvm { class RISCVSubtarget; namespace RISCVISD { enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, RET_FLAG, URET_FLAG, SRET_FLAG, MRET_FLAG, CALL, SELECT_CC, BuildPairF64, SplitF64, TAIL, // RV64I shifts, directly matching the semantics of the named RISC-V // instructions. SLLW, SRAW, SRLW, // 32-bit operations from RV64M that can't be simply matched with a pattern // at instruction selection time. DIVW, DIVUW, REMUW, // FPR32<->GPR transfer operations for RV64. Needed as an i32<->f32 bitcast // is not legal on RV64. FMV_W_X_RV64 matches the semantics of the FMV.W.X. // FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an any-extended result. // This is a more convenient semantic for producing dagcombines that remove // unnecessary GPR->FPR->GPR moves. FMV_W_X_RV64, FMV_X_ANYEXTW_RV64, // READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target // (returns (Lo, Hi)). It takes a chain operand. READ_CYCLE_WIDE }; } class RISCVTargetLowering : public TargetLowering { const RISCVSubtarget &Subtarget; public: explicit RISCVTargetLowering(const TargetMachine &TM, const RISCVSubtarget &STI); bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I = nullptr) const override; bool isLegalICmpImmediate(int64_t Imm) const override; bool isLegalAddImmediate(int64_t Imm) const override; bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; bool hasBitPreservingFPLogic(EVT VT) const override; // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override; // This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; ConstraintType getConstraintType(StringRef Constraint) const override; + + unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override; + std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const override; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { return VT.isScalarInteger(); } bool shouldInsertFencesForAtomic(const Instruction *I) const override { return isa(I) || isa(I); } Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const override; Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const override; ISD::NodeType getExtendForAtomicOps() const override { return ISD::SIGN_EXTEND; } bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { if (DAG.getMachineFunction().getFunction().hasMinSize()) return false; return true; } bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. unsigned getExceptionPointerRegister(const Constant *PersonalityFn) const override; /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. unsigned getExceptionSelectorRegister(const Constant *PersonalityFn) const override; private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet) const; void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Outs, bool IsRet, CallLoweringInfo *CLI) const; // Lower incoming arguments, copy physregs into vregs SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override { return true; } template SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, bool UseGOT) const; SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const; bool shouldConsiderGEPOffsetSplit() const override { return true; } SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const; bool isEligibleForTailCallOptimization( CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, const SmallVector &ArgLocs) const; TargetLowering::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; virtual Value *emitMaskedAtomicRMWIntrinsic( IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr, Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const override; TargetLowering::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override; virtual Value * emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr, Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const override; }; } #endif Index: vendor/llvm/dist-release_90/lib/Target/TargetMachine.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Target/TargetMachine.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/TargetMachine.cpp (revision 351709) @@ -1,282 +1,284 @@ //===-- TargetMachine.cpp - General Target Information ---------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file describes the general parts of a Target machine. // //===----------------------------------------------------------------------===// #include "llvm/Target/TargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Mangler.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; //--------------------------------------------------------------------------- // TargetMachine Class // TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options) : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr), RequireStructuredCFG(false), DefaultOptions(Options), Options(Options) { } TargetMachine::~TargetMachine() = default; bool TargetMachine::isPositionIndependent() const { return getRelocationModel() == Reloc::PIC_; } /// Reset the target options based on the function's attributes. // FIXME: This function needs to go away for a number of reasons: // a) global state on the TargetMachine is terrible in general, // b) these target options should be passed only on the function // and not on the TargetMachine (via TargetOptions) at all. void TargetMachine::resetTargetOptions(const Function &F) const { #define RESET_OPTION(X, Y) \ do { \ if (F.hasFnAttribute(Y)) \ Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \ else \ Options.X = DefaultOptions.X; \ } while (0) RESET_OPTION(UnsafeFPMath, "unsafe-fp-math"); RESET_OPTION(NoInfsFPMath, "no-infs-fp-math"); RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math"); RESET_OPTION(NoTrappingFPMath, "no-trapping-math"); StringRef Denormal = F.getFnAttribute("denormal-fp-math").getValueAsString(); if (Denormal == "ieee") Options.FPDenormalMode = FPDenormal::IEEE; else if (Denormal == "preserve-sign") Options.FPDenormalMode = FPDenormal::PreserveSign; else if (Denormal == "positive-zero") Options.FPDenormalMode = FPDenormal::PositiveZero; else Options.FPDenormalMode = DefaultOptions.FPDenormalMode; } /// Returns the code generation relocation model. The choices are static, PIC, /// and dynamic-no-pic. Reloc::Model TargetMachine::getRelocationModel() const { return RM; } /// Returns the code model. The choices are small, kernel, medium, large, and /// target default. CodeModel::Model TargetMachine::getCodeModel() const { return CMModel; } /// Get the IR-specified TLS model for Var. static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) { switch (GV->getThreadLocalMode()) { case GlobalVariable::NotThreadLocal: llvm_unreachable("getSelectedTLSModel for non-TLS variable"); break; case GlobalVariable::GeneralDynamicTLSModel: return TLSModel::GeneralDynamic; case GlobalVariable::LocalDynamicTLSModel: return TLSModel::LocalDynamic; case GlobalVariable::InitialExecTLSModel: return TLSModel::InitialExec; case GlobalVariable::LocalExecTLSModel: return TLSModel::LocalExec; } llvm_unreachable("invalid TLS model"); } bool TargetMachine::shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const { // If the IR producer requested that this GV be treated as dso local, obey. if (GV && GV->isDSOLocal()) return true; // If we are not supossed to use a PLT, we cannot assume that intrinsics are // local since the linker can convert some direct access to access via plt. if (M.getRtLibUseGOT() && !GV) return false; // According to the llvm language reference, we should be able to // just return false in here if we have a GV, as we know it is // dso_preemptable. At this point in time, the various IR producers // have not been transitioned to always produce a dso_local when it // is possible to do so. // In the case of intrinsics, GV is null and there is nowhere to put // dso_local. Returning false for those will produce worse code in some // architectures. For example, on x86 the caller has to set ebx before calling // a plt. // As a result we still have some logic in here to improve the quality of the // generated code. // FIXME: Add a module level metadata for whether intrinsics should be assumed // local. Reloc::Model RM = getRelocationModel(); const Triple &TT = getTargetTriple(); // DLLImport explicitly marks the GV as external. if (GV && GV->hasDLLImportStorageClass()) return false; // On MinGW, variables that haven't been declared with DLLImport may still // end up automatically imported by the linker. To make this feasible, // don't assume the variables to be DSO local unless we actually know // that for sure. This only has to be done for variables; for functions // the linker can insert thunks for calling functions from another DLL. - if (TT.isWindowsGNUEnvironment() && GV && GV->isDeclarationForLinker() && - isa(GV)) + if (TT.isWindowsGNUEnvironment() && TT.isOSBinFormatCOFF() && GV && + GV->isDeclarationForLinker() && isa(GV)) return false; // On COFF, don't mark 'extern_weak' symbols as DSO local. If these symbols // remain unresolved in the link, they can be resolved to zero, which is // outside the current DSO. if (TT.isOSBinFormatCOFF() && GV && GV->hasExternalWeakLinkage()) return false; // Every other GV is local on COFF. // Make an exception for windows OS in the triple: Some firmware builds use // *-win32-macho triples. This (accidentally?) produced windows relocations // without GOT tables in older clang versions; Keep this behaviour. - if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO())) + // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables + // either. + if (TT.isOSBinFormatCOFF() || TT.isOSWindows()) return true; // Most PIC code sequences that assume that a symbol is local cannot // produce a 0 if it turns out the symbol is undefined. While this // is ABI and relocation depended, it seems worth it to handle it // here. if (GV && isPositionIndependent() && GV->hasExternalWeakLinkage()) return false; if (GV && !GV->hasDefaultVisibility()) return true; if (TT.isOSBinFormatMachO()) { if (RM == Reloc::Static) return true; return GV && GV->isStrongDefinitionForLinker(); } // Due to the AIX linkage model, any global with default visibility is // considered non-local. if (TT.isOSBinFormatXCOFF()) return false; assert(TT.isOSBinFormatELF() || TT.isOSBinFormatWasm()); assert(RM != Reloc::DynamicNoPIC); bool IsExecutable = RM == Reloc::Static || M.getPIELevel() != PIELevel::Default; if (IsExecutable) { // If the symbol is defined, it cannot be preempted. if (GV && !GV->isDeclarationForLinker()) return true; // A symbol marked nonlazybind should not be accessed with a plt. If the // symbol turns out to be external, the linker will convert a direct // access to an access via the plt, so don't assume it is local. const Function *F = dyn_cast_or_null(GV); if (F && F->hasFnAttribute(Attribute::NonLazyBind)) return false; bool IsTLS = GV && GV->isThreadLocal(); bool IsAccessViaCopyRelocs = GV && Options.MCOptions.MCPIECopyRelocations && isa(GV); Triple::ArchType Arch = TT.getArch(); bool IsPPC = Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::ppc64le; // Check if we can use copy relocations. PowerPC has no copy relocations. if (!IsTLS && !IsPPC && (RM == Reloc::Static || IsAccessViaCopyRelocs)) return true; } // ELF & wasm support preemption of other symbols. return false; } bool TargetMachine::useEmulatedTLS() const { // Returns Options.EmulatedTLS if the -emulated-tls or -no-emulated-tls // was specified explicitly; otherwise uses target triple to decide default. if (Options.ExplicitEmulatedTLS) return Options.EmulatedTLS; return getTargetTriple().hasDefaultEmulatedTLS(); } TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default; Reloc::Model RM = getRelocationModel(); bool IsSharedLibrary = RM == Reloc::PIC_ && !IsPIE; bool IsLocal = shouldAssumeDSOLocal(*GV->getParent(), GV); TLSModel::Model Model; if (IsSharedLibrary) { if (IsLocal) Model = TLSModel::LocalDynamic; else Model = TLSModel::GeneralDynamic; } else { if (IsLocal) Model = TLSModel::LocalExec; else Model = TLSModel::InitialExec; } // If the user specified a more specific model, use that. TLSModel::Model SelectedModel = getSelectedTLSModel(GV); if (SelectedModel > Model) return SelectedModel; return Model; } /// Returns the optimization level: None, Less, Default, or Aggressive. CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; } void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; } TargetTransformInfo TargetMachine::getTargetTransformInfo(const Function &F) { return TargetTransformInfo(F.getParent()->getDataLayout()); } void TargetMachine::getNameWithPrefix(SmallVectorImpl &Name, const GlobalValue *GV, Mangler &Mang, bool MayAlwaysUsePrivate) const { if (MayAlwaysUsePrivate || !GV->hasPrivateLinkage()) { // Simple case: If GV is not private, it is not important to find out if // private labels are legal in this case or not. Mang.getNameWithPrefix(Name, GV, false); return; } const TargetLoweringObjectFile *TLOF = getObjFileLowering(); TLOF->getNameWithPrefix(Name, GV, *this); } MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const { const TargetLoweringObjectFile *TLOF = getObjFileLowering(); SmallString<128> NameStr; getNameWithPrefix(NameStr, GV, TLOF->getMangler()); return TLOF->getContext().getOrCreateSymbol(NameStr); } TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { // Since Analysis can't depend on Target, use a std::function to invert the // dependency. return TargetIRAnalysis( [this](const Function &F) { return this->getTargetTransformInfo(F); }); } Index: vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp (revision 351709) @@ -1,45520 +1,45530 @@ //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt ExperimentalVectorWideningLegalization( "x86-experimental-vector-widening-legalization", cl::init(false), cl::desc("Enable an experimental vector type legalization through widening " "rather than promotion."), cl::Hidden); static cl::opt ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments " "(the last x86-experimental-pref-loop-alignment bits" " of the loop header PC will be 0)."), cl::Hidden); static cl::opt MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden); /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without /// crashing. static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, const char *Msg) { MachineFunction &MF = DAG.getMachineFunction(); DAG.getContext()->diagnose( DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); } X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget.isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget.is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) addBypassSlowDiv(64, 32); } if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } if (Subtarget.isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); } else if (Subtarget.isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); } else { setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); } // If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b. // FIXME: Should we be limitting the atomic size on other configs? Default is // 1024. if (!Subtarget.hasCmpxchg8b()) setMaxAtomicSizeInBitsSupported(32); // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget.is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); } setOperationAction(ISD::ABS , MVT::i64 , Custom); // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { setOperationAction(ShiftOp , MVT::i16 , Custom); setOperationAction(ShiftOp , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ShiftOp , MVT::i64 , Custom); } // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget.is64Bit()) { if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) // f32/f64 are legal, f80 is custom. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); else setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget.useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); if (!Subtarget.useSoftFloat()) { // SSE has no i16 to fp conversion, only i32. if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand); } // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); if (!Subtarget.useSoftFloat()) { // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget.is64Bit()) { if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } else { setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); } } else if (!Subtarget.useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } else if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::BR_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); } if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); if (!Subtarget.hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } } if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); else setOperationAction(ISD::CTPOP , MVT::i64 , Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); // Darwin ABI issue. for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::ConstantPool , VT, Custom); setOperationAction(ISD::JumpTable , VT, Custom); setOperationAction(ISD::GlobalAddress , VT, Custom); setOperationAction(ISD::GlobalTLSAddress, VT, Custom); setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SHL_PARTS, VT, Custom); setOperationAction(ISD::SRA_PARTS, VT, Custom); setOperationAction(ISD::SRL_PARTS, VT, Custom); } if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (!Subtarget.is64Bit()) setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && TM.Options.ExceptionModel != ExceptionHandling::SjLj) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); bool Is64Bit = Subtarget.is64Bit(); setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); // Disable f32->f64 extload as we can only generate this in one instruction // under optsize. So its easier to pattern match (fpext (load)) for that // case instead of needing to emit 2 instructions for extload in the // non-optsize case. setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG, VT, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT, Custom); // These might be better off as horizontal vector ops. setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); if (UseX87) addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); if (UseX87) setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. if (UseX87) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (UseX87) { // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } } // Expand FP32 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f32)) { if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) { addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0f)); // xorps } // Expand FP64 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f64)) { if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) { addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87, except f128 in MMX. if (UseX87) { if (Subtarget.is64Bit() && Subtarget.hasMMX()) { addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); setOperationAction(ISD::FABS , MVT::f128, Custom); setOperationAction(ISD::FNEG , MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); } addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); setOperationAction(ISD::LROUND, MVT::f80, Expand); setOperationAction(ISD::LLROUND, MVT::f80, Expand); setOperationAction(ISD::LRINT, MVT::f80, Expand); setOperationAction(ISD::LLRINT, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); } // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::SREM, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::UREM, VT, Custom); } setOperationAction(ISD::MUL, MVT::v2i8, Custom); setOperationAction(ISD::MUL, MVT::v2i16, Custom); setOperationAction(ISD::MUL, MVT::v2i32, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); setOperationAction(ISD::MUL, MVT::v4i16, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v4i32, Custom); setOperationAction(ISD::MULHS, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i8, Custom); setOperationAction(ISD::MULHS, MVT::v16i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom); } setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); if (!ExperimentalVectorWideningLegalization) { // Use widening instead of promotion. for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16 }) { setOperationAction(ISD::UADDSAT, VT, Custom); setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::USUBSAT, VT, Custom); setOperationAction(ISD::SSUBSAT, VT, Custom); } } setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); // Provide custom widening for v2f32 setcc. This is really for VLX when // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to // type legalization changing the result type to v4i1 during widening. // It works fine for SSE2 and is probably faster so no need to qualify with // VLX support. setOperationAction(ISD::SETCC, MVT::v2i32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // We support custom legalizing of sext and anyext loads for specific // memory vector types which we can load as a scalar (or sequence of // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); if (VT == MVT::v2i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::SELECT, MVT::v4i32, Custom); setOperationAction(ISD::SELECT, MVT::v8i16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); // Custom legalize these to avoid over promotion or custom promotion. setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is // split again based on the input type, this will cause an AssertSExt i16 to // be emitted instead of an AssertZExt. This will allow packssdw followed by // packuswb to be used to truncate to v8i8. This is necessary since packusdw // isn't available until sse4.1. setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i16, Custom); setOperationAction(ISD::LOAD, MVT::v8i8, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); if (ExperimentalVectorWideningLegalization) { setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); } else { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); } // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } setOperationAction(ISD::ROTL, MVT::v4i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i16, Custom); // With AVX512, expanding (and promoting the shifts) is better. if (!Subtarget.hasAVX512()) setOperationAction(ISD::ROTL, MVT::v16i8, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); // These might be better off as horizontal vector ops. setOperationAction(ISD::ADD, MVT::i16, Custom); setOperationAction(ISD::ADD, MVT::i32, Custom); setOperationAction(ISD::SUB, MVT::i16, Custom); setOperationAction(ISD::SUB, MVT::i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); // We directly match byte blends in the backend as they match the VSELECT // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } if (!ExperimentalVectorWideningLegalization) { // Avoid narrow result types when widening. The legal types are listed // in the next loop. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); } } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); if (!ExperimentalVectorWideningLegalization) setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); } // i8 vectors are custom because the source register and source // source memory operand types are not the same width. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::ROTL, VT, Custom); // XOP can efficiently perform BITREVERSE with VPPERM. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { bool HasInt256 = Subtarget.hasInt256(); addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } // These types need custom splitting if their input is a 128-bit vector. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); // With BWI, expanding (and promoting the shifts) is the better. if (!Subtarget.hasBWI()) setOperationAction(ISD::ROTL, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); setOperationAction(ISD::SELECT, MVT::v16i16, Custom); setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::FMA, VT, Legal); } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); } setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i32, Custom); setOperationAction(ISD::MULHS, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); setOperationAction(ISD::ABS, MVT::v4i64, Custom); setOperationAction(ISD::SMAX, MVT::v4i64, Custom); setOperationAction(ISD::UMAX, MVT::v4i64, Custom); setOperationAction(ISD::SMIN, MVT::v4i64, Custom); setOperationAction(ISD::UMIN, MVT::v4i64, Custom); setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); } for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } if (HasInt256) { // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); } } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); } if (HasInt256) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); if (HasInt256) { // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MGATHER, VT, Custom); } } // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); // There is no byte sized k-register load or store without AVX512DQ. if (!Subtarget.hasDQI()) { setOperationAction(ISD::LOAD, MVT::v1i1, Custom); setOperationAction(ISD::LOAD, MVT::v2i1, Custom); setOperationAction(ISD::LOAD, MVT::v4i1, Custom); setOperationAction(ISD::LOAD, MVT::v8i1, Custom); setOperationAction(ISD::STORE, MVT::v1i1, Custom); setOperationAction(ISD::STORE, MVT::v2i1, Custom); setOperationAction(ISD::STORE, MVT::v4i1, Custom); setOperationAction(ISD::STORE, MVT::v8i1, Custom); } // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Custom); setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::USUBSAT, VT, Custom); setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); } for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } // This block controls legalization for 512-bit operations with 32/64 bit // elements. 512-bits can be disabled based on prefer-vector-width and // required-vector-width function attributes. if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use // k-masks. if (!Subtarget.hasVLX()) { for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); if (ExperimentalVectorWideningLegalization) { // Need to custom widen this if we don't have AVX512BW. setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); } for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); } // Without BWI we need to use custom lowering to handle MVT::v64i8 input. for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Legal under AVX1. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } // Need to custom split v32i16/v64i8 bitcasts. if (!Subtarget.hasBWI()) { setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); } if (Subtarget.hasVBMI2()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } }// has AVX-512 // This block controls legalization for operations that don't have // pre-AVX512 equivalents. Without VLX we use 512-bit operations for // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom); setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SINT_TO_FP, VT, Legal); setOperationAction(ISD::UINT_TO_FP, VT, Legal); setOperationAction(ISD::FP_TO_SINT, VT, Legal); setOperationAction(ISD::FP_TO_UINT, VT, Legal); setOperationAction(ISD::MUL, VT, Legal); } } if (Subtarget.hasCDI()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } } // This block control legalization of v32i1/v64i1 which are available with // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with // useBWIRegs. if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); for (auto VT : { MVT::v32i1, MVT::v64i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::UADDSAT, VT, Custom); setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::USUBSAT, VT, Custom); setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); for (auto VT : { MVT::v16i1, MVT::v32i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); } // This block controls legalization for v32i16 and v64i8. 512-bits can be // disabled based on prefer-vector-width and required-vector-width function // attributes. if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); // Extends from v64i1 masks to 512-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::MUL, MVT::v32i16, Legal); setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v64i8, MVT::v32i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } if (Subtarget.hasVBMI2()) { setOperationAction(ISD::FSHL, MVT::v32i16, Custom); setOperationAction(ISD::FSHR, MVT::v32i16, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); } // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); } if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } if (Subtarget.hasVBMI2()) { // TODO: Make these legal even without VLX? for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); // Support carry in as value rather than glue. setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); setLibcallName(RTLIB::MUL_I128, nullptr); } // Combine sin / cos into _sincos_stret if it is available. if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, ISD::FLOG10, ISD::FPOW, ISD::FSIN}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget.getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; // TODO: These control memcmp expansion in CGP and could be raised higher, but // that needs to benchmarked and balanced with the potential use of vector // load/store types (PR33329, PR33914). MaxLoadsPerMemcmp = 2; MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. verifyIntrinsicTables(); } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } bool X86TargetLowering::useStackGuardXorFP() const { // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. return Subtarget.getTargetTriple().isOSMSVCRT(); } SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { EVT PtrTy = getPointerTy(DAG.getDataLayout()); unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP; MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); return SDValue(Node, 0); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(MVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; if (ExperimentalVectorWideningLegalization && VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { if (!VT.isVector()) return MVT::i8; if (Subtarget.hasAVX512()) { const unsigned NumElts = VT.getVectorNumElements(); // Figure out what this type will be legalized to. EVT LegalVT = VT; while (getTypeAction(Context, LegalVT) != TypeLegal) LegalVT = getTypeToTransformTo(Context, LegalVT); // If we got a 512-bit vector then we'll definitely have a vXi1 compare. if (LegalVT.getSimpleVT().is512BitVector()) return EVT::getVectorVT(Context, MVT::i1, NumElts); if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { // If we legalized to less than a 512-bit vector, then we will use a vXi1 // compare for vXi32/vXi64 for sure. If we have BWI we will also support // vXi16/vXi8. MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) return EVT::getVectorVT(Context, MVT::i1, NumElts); } } return VT.changeVectorElementTypeToInteger(); } /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast(Ty)) { if (VTy->getBitWidth() == 128) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast(Ty)) { for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) break; } } } /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { if (Subtarget.is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = DL.getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; } unsigned Align = 4; if (Subtarget.hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. /// For vector ops we check that the overall size isn't larger than our /// preferred vector width. EVT X86TargetLowering::getOptimalMemOpType( uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const { if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX() && (Subtarget.getPreferVectorWidth() >= 256)) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we // choose an optimal type with a vector element larger than a byte, // getMemsetStores() may create an intermediate splat (using an integer // multiply) before we splat as a vector. return MVT::v32i8; } if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v16i8; // TODO: Can SSE1 handle a byte vector? // If we have SSE1 registers we should be able to use them. if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v4f32; } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. // Also, do not use f64 to lower memset unless this is a memset of zeros. // The gymnastics of splatting a byte value into an XMM register and then // only using 8-byte stores (because this is a CPU with slow unaligned // 16-byte accesses) makes that a loser. return MVT::f64; } } // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. if (Subtarget.is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsMisalignedMemoryAccesses( EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: // 8-byte and under are always assumed to be fast. *Fast = true; break; case 128: *Fast = !Subtarget.isUnalignedMem16Slow(); break; case 256: *Fast = !Subtarget.isUnalignedMem32Slow(); break; // TODO: What about AVX-512 (512-bit) accesses? } } // NonTemporal vector memory ops must be aligned. if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { // NT loads can only be vector aligned, so if its less aligned than the // minimum vector size (which we can split the vector down to), we might as // well use a regular unaligned vector load. // We don't have any NT loads pre-SSE41. if (!!(Flags & MachineMemOperand::MOLoad)) return (Align < 16 || !Subtarget.hasSSE41()); return false; } // Misaligned accesses of any size are always allowed. return true; } /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (isPositionIndependent() && Subtarget.isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, ArgListTy &Args) const { // Only relabel X86-32 for C / Stdcall CCs. if (Subtarget.is64Bit()) return; if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) return; unsigned ParamRegs = 0; if (auto *M = MF->getFunction().getParent()) ParamRegs = M->getNumberRegisterParameters(); // Mark the first N int arguments as having reg for (unsigned Idx = 0; Idx < Args.size(); Idx++) { Type *T = Args[Idx].Ty; if (T->isIntOrPtrTy()) if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { unsigned numRegs = 1; if (MF->getDataLayout().getTypeAllocSize(T) > 4) numRegs = 2; if (ParamRegs < numRegs) return; ParamRegs -= numRegs; Args[Idx].IsInReg = true; } } } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget.is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())); return Table; } /// This returns the relocation base for the given PIC jumptable, /// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. if (Subtarget.isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } std::pair X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: case MVT::v16f32: case MVT::v8f64: RRC = &X86::VR128XRegClass; break; } return std::make_pair(RRC, Cost); } unsigned X86TargetLowering::getAddressSpace() const { if (Subtarget.is64Bit()) return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257; return 256; } static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); } static Constant* SegmentOffset(IRBuilder<> &IRB, unsigned Offset, unsigned AddressSpace) { return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { // glibc, bionic, and Fuchsia have a special slot for the stack guard in // tcbhead_t; use it instead of the usual global variable (see // sysdeps/{i386,x86_64}/nptl/tls.h) if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { if (Subtarget.isTargetFuchsia()) { // defines ZX_TLS_STACK_GUARD_OFFSET with this value. return SegmentOffset(IRB, 0x10, getAddressSpace()); } else { // %fs:0x28, unless we're using a Kernel code model, in which case // it's %gs:0x28. gs:0x14 on i386. unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; return SegmentOffset(IRB, Offset, getAddressSpace()); } } return TargetLowering::getIRStackGuard(IRB); } void X86TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { // MSVC CRT has a global variable holding security cookie. M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( "__security_check_cookie", Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext())); if (Function *F = dyn_cast(SecurityCheckCookie.getCallee())) { F->setCallingConv(CallingConv::X86_FastCall); F->addAttribute(1, Attribute::AttrKind::InReg); } return; } // glibc, bionic, and Fuchsia have a special slot for the stack guard. if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) return; TargetLowering::insertSSPDeclarations(M); } Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { // MSVC CRT has a global variable holding security cookie. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { return M.getGlobalVariable("__security_cookie"); } return TargetLowering::getSDagStackGuard(M); } Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { return M.getFunction("__security_check_cookie"); } return TargetLowering::getSSPStackGuardCheck(M); } Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (Subtarget.getTargetTriple().isOSContiki()) return getDefaultSafeStackPointerLocation(IRB, false); // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget.isTargetAndroid()) { // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: // %gs:0x24 on i386 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; return SegmentOffset(IRB, Offset, getAddressSpace()); } // Fuchsia is similar. if (Subtarget.isTargetFuchsia()) { // defines ZX_TLS_UNSAFE_SP_OFFSET with this value. return SegmentOffset(IRB, 0x18, getAddressSpace()); } return TargetLowering::getSafeStackPointerLocation(IRB); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); return SrcAS < 256 && DestAS < 256; } //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } /// Lowers masks values (v*i1) to the local register values /// \returns DAG node after lowering to register type static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { EVT ValVT = ValArg.getValueType(); if (ValVT == MVT::v1i1) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg, DAG.getIntPtrConstant(0, Dl)); if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { // Two stage lowering might be required // bitcast: v8i1 -> i8 / v16i1 -> i16 // anyextend: i8 -> i32 / i16 -> i32 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16; SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); if (ValLoc == MVT::i32) ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); return ValToCopy; } if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { // One stage lowering is required // bitcast: v32i1 -> i32 / v64i1 -> i64 return DAG.getBitcast(ValLoc, ValArg); } return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg); } /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, SmallVector, 8> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The value should reside in two registers"); // Before splitting the value we cast it to i64 Arg = DAG.getBitcast(MVT::i64, Arg); // Splitting the value into two i32 types SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, DAG.getConstant(0, Dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, DAG.getConstant(1, Dl, MVT::i32)); // Attach the two i32 types into corresponding registers RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); // In some cases we need to disable registers from the default CSR list. // For example, when they are used for argument passing. bool ShouldDisableCalleeSavedRegister = CallConv == CallingConv::X86_RegCall || MF.getFunction().hasFnAttribute("no_caller_saved_registers"); if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, MVT::i32)); // Copy the result values into the output registers. for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); // Add the register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } else if (ValVT == MVT::f64 && (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget.is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget.hasSSE2()) ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } } SmallVector, 8> RegsToPass; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], Subtarget); assert(2 == RegsToPass.size() && "Expecting two registers after Pass64BitArgInRegs"); // Add the second register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } // Add nodes to the DAG and add the values into the RetOps list for (auto &Reg : RegsToPass) { Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); } } // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. // // Checking Function.hasStructRetAttr() here is insufficient because the IR // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { // When we have both sret and another return value, we should use the // original Chain stored in RetOps[0], instead of the current Chain updated // in the above loop. If we only have sret, RetOps[0] equals to Chain. // For the case of sret and another return value, we have // Chain_0 at the function entry // Chain_1 = getCopyToReg(Chain_0) in the above loop // If we use Chain_1 in getCopyFromReg, we will have // Val = getCopyFromReg(Chain_1) // Chain_2 = getCopyToReg(Chain_1, Val) from below // getCopyToReg(Chain_0) will be glued together with // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be // in Unit B, and we will have cyclic dependency between Unit A and Unit B: // Data dependency from Unit B to Unit A due to usage of Val in // getCopyToReg(Chain_1, Val) // Chain dependency from Unit A to Unit B // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); // Add the returned register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RetValReg); } const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (X86::GR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); X86ISD::NodeType opcode = X86ISD::RET_FLAG; if (CallConv == CallingConv::X86_INTR) opcode = X86ISD::IRET; return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; // If we are returning more than one value, we can definitely // not make a tail call see PR19530 if (UI->getNumOperands() > 4) return false; if (UI->getNumOperands() == 4 && UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT = MVT::i32; bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { // The ABI does not require i1, i8 or i16 to be extended. // // On Darwin, there is code in the wild relying on Clang's old behaviour of // always extending i8/i16 return values, so keep doing that for now. // (PR26665). ReturnMVT = MVT::i8; } EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// Reads two 32 bit registers and creates a 64 bit mask value. /// \param VA The current 32 bit value that need to be assigned. /// \param NextVA The next 32 bit value that need to be assigned. /// \param Root The parent DAG node. /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for /// glue purposes. In the case the DAG is already using /// physical register instead of virtual, we should glue /// our new SDValue to InFlag SDvalue. /// \return a new SDvalue of size 64bit. static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &Dl, const X86Subtarget &Subtarget, SDValue *InFlag = nullptr) { assert((Subtarget.hasBWI()) && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(VA.getValVT() == MVT::v64i1 && "Expecting first location of 64 bit width type"); assert(NextVA.getValVT() == VA.getValVT() && "The locations should have the same type"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The values should reside in two registers"); SDValue Lo, Hi; SDValue ArgValueLo, ArgValueHi; MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterClass *RC = &X86::GR32RegClass; // Read a 32 bit value from the registers. if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); } else { // When a physical register is available read the value from it and glue // the reads together. ArgValueLo = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueLo.getValue(2); ArgValueHi = DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueHi.getValue(2); } // Convert the i32 type into v32i1 type. Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); // Convert the i32 type into v32i1 type. Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); // Concatenate the two values together. return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); } /// The function will lower a register of various sizes (8/16/32/64) /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) /// \returns a DAG node contains the operand after lowering to mask type. static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { SDValue ValReturned = ValArg; if (ValVT == MVT::v1i1) return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); if (ValVT == MVT::v64i1) { // In 32 bit machine, this case is handled by getv64i1Argument assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); // In 64 bit machine, There is no need to truncate the value only bitcast } else { MVT maskLen; switch (ValVT.getSimpleVT().SimpleTy) { case MVT::v8i1: maskLen = MVT::i8; break; case MVT::v16i1: maskLen = MVT::i16; break; case MVT::v32i1: maskLen = MVT::i32; break; default: llvm_unreachable("Expecting a vector of i1 types"); } ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); } return DAG.getBitcast(ValVT, ValReturned); } /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue X86TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, uint32_t *RegMask) const { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector RVLocs; bool Is64Bit = Subtarget.is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; ++I, ++InsIndex) { CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); // In some calling conventions we need to remove the used registers // from the register mask. if (RegMask) { for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); } // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { if (!Subtarget.hasX87()) report_fatal_error("X87 register return with X87 disabled"); CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } SDValue Val; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Val = getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag) .getValue(1); Val = Chain.getValue(0); InFlag = Chain.getValue(2); } if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1, dl)); if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) { if (VA.getValVT().isVector() && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); } else Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); } InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(ArrayRef Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(ArrayRef Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, /*isTailCall*/false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || CC == CallingConv::HHVM); } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: // Swift: case CallingConv::Swift: return true; default: return canGuaranteeTCO(CC); } } /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; ImmutableCallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; return true; } SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo &MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; MVT PtrVT = getPointerTy(DAG.getDataLayout()); // If value is passed by pointer we have address passed instead of the value // itself. No need to extend if the mask value and location share the same // absolute size. bool ExtendedInMem = VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. // FIXME: For now, all byval parameter objects are marked as aliasing. This // can be improved with deeper analysis. int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, /*isAliased=*/true); return DAG.getFrameIndex(FI, PtrVT); } // This is an argument in memory. We might be able to perform copy elision. // If the argument is passed directly in memory without any extension, then we // can perform copy elision. Large vector types, for example, may be passed // indirectly by pointer. if (Flags.isCopyElisionCandidate() && VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) { EVT ArgVT = Ins[i].ArgVT; SDValue PartAddr; if (Ins[i].PartOffset == 0) { // If this is a one-part value or the first part of a multi-part value, // create a stack object for the entire argument value type and return a // load from our portion of it. This assumes that if the first part of an // argument is in memory, the rest will also be in memory. int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), /*IsImmutable=*/false); PartAddr = DAG.getFrameIndex(FI, PtrVT); return DAG.getLoad( ValVT, dl, Chain, PartAddr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); } else { // This is not the first piece of an argument in memory. See if there is // already a fixed stack object including this offset. If so, assume it // was created by the PartOffset == 0 branch above and create a load from // the appropriate offset into it. int64_t PartBegin = VA.getLocMemOffset(); int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; int FI = MFI.getObjectIndexBegin(); for (; MFI.isFixedObjectIndex(FI); ++FI) { int64_t ObjBegin = MFI.getObjectOffset(FI); int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) break; } if (MFI.isFixedObjectIndex(FI)) { SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); return DAG.getLoad( ValVT, dl, Chain, Addr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, Ins[i].PartOffset)); } } } int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, VA.getLocMemOffset(), isImmutable); // Set SExt or ZExt flag. if (VA.getLocInfo() == CCValAssign::ZExt) { MFI.setObjectZExt(FI, true); } else if (VA.getLocInfo() == CCValAssign::SExt) { MFI.setObjectSExt(FI, true); } SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); return ExtendedInMem ? (VA.getValVT().isVector() ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) : Val; } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. // TODO: __vectorcall will change this. return None; } const Function &F = MF.getFunction(); bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool isSoftFloat = Subtarget.useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } #ifndef NDEBUG static bool isSortedByValueNo(ArrayRef ArgLocs) { return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), [](const CCValAssign &A, const CCValAssign &B) -> bool { return A.getValNo() < B.getValNo(); }); } #endif SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Function &F = MF.getFunction(); if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && F.getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo &MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); assert( !(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Ins, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); } // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++InsIndex) { assert(InsIndex < Ins.size() && "Invalid Ins index"); CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); if (VA.needsCustom()) { assert( VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // v64i1 values, in regcall calling convention, that are // compiled to 32 bit arch, are split up into two registers. ArgValue = getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); } else { const TargetRegisterClass *RC; if (RegVT == MVT::i8) RC = &X86::GR8RegClass; else if (RegVT == MVT::i16) RC = &X86::GR16RegClass; else if (RegVT == MVT::i32) RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; else if (RegVT == MVT::f80) RC = &X86::RFP80RegClass; else if (RegVT == MVT::f128) RC = &X86::VR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; else if (RegVT == MVT::v1i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; else if (RegVT == MVT::v32i1) RC = &X86::VK32RegClass; else if (RegVT == MVT::v64i1) RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else if (VA.getValVT().isVector() && VA.getValVT().getScalarType() == MVT::i1 && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); } else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); } // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal()) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); InVals.push_back(ArgValue); } for (unsigned I = 0, E = Ins.size(); I != E; ++I) { // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. if (CallConv == CallingConv::Swift) continue; // All x86 ABIs require that for returning structs by value we copy the // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. if (Ins[I].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } } unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. if (shouldGuaranteeTCO(CallConv, MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. if (MFI.hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. assert(!(Subtarget.useSoftFloat() && F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. if (Is64Bit && isVarArg && MFI.hasVAStart()) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. SmallVector LiveGPRs; SmallVector LiveXMMRegs; SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); LiveXMMRegs.push_back( DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } } if (IsWin64) { // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so // they may be loaded by dereferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy(DAG.getDataLayout())); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), Offset)); MemOps.push_back(Store); Offset += 8; } if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. SmallVector SaveXMMOps; SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex(), dl)); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset(), dl)); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. if (Subtarget.hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; else if (Subtarget.hasAVX()) VecVT = MVT::v8f32; else if (Subtarget.hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. SmallVector RegParmTypes; MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; RegParmTypes.push_back(IntVT); if (VecVT != MVT::Other) RegParmTypes.push_back(VecVT); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } // Copy all forwards from physical to virtual registers. for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); } } // Some CCs need callee pop. if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { // X86 interrupts must pop the error code (and the alignment padding) if // present. FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { assert(Is64Bit); // TODO: Add a mechanism to frame lowering that will allow us to indicate // that we'd prefer this slot be allocated towards the bottom of the frame // (i.e. near the stack pointer after allocating the frame). Every // funclet needs a copy of this slot in its (mostly empty) frame, and the // offset from the bottom of this and each funclet's frame must be the // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accommodate holding this slot at the correct offset). int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } if (CallConv == CallingConv::X86_RegCall || F.hasFnAttribute("no_caller_saved_registers")) { MachineRegisterInfo &MRI = MF.getRegInfo(); for (std::pair Pair : MRI.liveins()) MRI.disableCalleeSavedRegister(Pair.first); } return Chain; } SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } /// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr( SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, const SDLoc &dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); return SDValue(OutRetAddr.getNode(), 1); } /// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, const SDLoc &dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewReturnAddrFI)); return Chain; } /// Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; bool &isTailCall = CLI.IsTailCall; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo(); auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null(CLI.CS.getInstruction()); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); const auto *II = dyn_cast_or_null(CLI.CS.getInstruction()); bool HasNoCfCheck = (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()); const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); MachineFunction::CallSiteInfo CSInfo; if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); if (Attr.getValueAsString() == "true") isTailCall = false; if (Subtarget.isPICStyleGOT() && !MF.getTarget().Options.GuaranteedTailCallOpt) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code // that require lazy function symbol resolution. Using musttail or // GuaranteedTailCallOpt will override this. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G || (!G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility())) isTailCall = false; } bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address // around. isTailCall = true; } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, MF.getFunction().hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) ++NumTailCalls; } assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Outs, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. if (FPDiff < X86Info->getTCReturnAddrDelta()) X86Info->setTCReturnAddrDelta(FPDiff); } unsigned NumBytesToPush = NumBytes; unsigned NumBytesToPop = NumBytes; // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; if (!ArgLocs.back().isMemLoc()) report_fatal_error("cannot use inalloca attribute on a register " "parameter"); if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); } if (!IsSibcall) Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; if (Flags.isInAlloca()) continue; CCValAssign &VA = ArgLocs[I]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[OutIndex]; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getBitcast(MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: Arg = DAG.getBitcast(RegVT, Arg); break; case CCValAssign::Indirect: { if (isByVal) { // Memcpy the argument to a temporary stack slot to prevent // the caller from seeing any modifications the callee may make // as guaranteed by the `byval` attribute. int FrameIdx = MF.getFrameInfo().CreateStackObject( Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()), false); SDValue StackSlot = DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); Chain = CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl); // From now on treat this as a regular pointer Arg = StackSlot; isByVal = false; } else { // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore( Chain, dl, Arg, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); Arg = SpillSlot; } break; } } if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // Split v64i1 value into two registers Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); const TargetOptions &Options = DAG.getTarget().Options; if (Options.EnableDebugEntryValues) CSInfo.emplace_back(VA.getLocReg(), I); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; case X86::XMM2: ShadowReg = X86::R8; break; case X86::XMM3: ShadowReg = X86::R9; break; } if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget.isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair( unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of // the tail jump. This is done to circumvent the ebx/callee-saved problem // for tail calls on PIC/GOT architectures. Normally we would just put the // address of GOT into ebx and then call target@PLT. But for tail calls // ebx would be restored (since ebx is callee saved) before jumping to the // target@PLT. // Note: The actual moving to ECX is done further down. GlobalAddressSDNode *G = dyn_cast(Callee); if (G && !G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa(Callee)) Callee = LowerExternalSymbol(Callee, DAG); } } if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); SmallVector MemOpChains2; SDValue FIN; int FI = 0; for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { if (VA.needsCustom()) { assert((CallConv == CallingConv::X86_RegCall) && "Expecting custom case only in regcall calling convention"); // This means that we are in special case where one argument was // passed through two register locations - Skip the next location ++I; } continue; } assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. MemOpChains2.push_back(DAG.getStore( ArgChain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, getPointerTy(DAG.getDataLayout()), RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into registers. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. } else if (Callee->getOpcode() == ISD::GlobalAddress || Callee->getOpcode() == ISD::ExternalSymbol) { // Lower direct calls to global addresses and external symbols. Setting // ForCall to true here has the effect of removing WrapperRIP when possible // to allow direct calls to be selected without first materializing the // address into a register. Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector Ops; if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } Ops.push_back(Chain); Ops.push_back(Callee); if (isTailCall) Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we // set X86_INTR calling convention because it has the same CSR mask // (same preserved registers). const uint32_t *Mask = RegInfo->getCallPreservedMask( MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv); assert(Mask && "Missing call preserved mask for calling convention"); // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) { const Function &CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn.hasPersonalityFn() ? classifyEHPersonality(CallerFn.getPersonalityFn()) : EHPersonality::Unknown; if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } // Define a new register mask from the existing mask. uint32_t *RegMask = nullptr; // In some calling conventions we need to remove the used physical registers // from the reg mask. if (CallConv == CallingConv::X86_RegCall || HasNCSR) { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Allocate a new Reg Mask and copy Mask. RegMask = MF.allocateRegMask(); unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize); // Make sure all sub registers of the argument registers are reset // in the RegMask. for (auto const &RegPair : RegsToPass) for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); // Create the RegMask Operand according to our updated mask. Ops.push_back(DAG.getRegisterMask(RegMask)); } else { // Create the RegMask Operand according to the static mask. Ops.push_back(DAG.getRegisterMask(Mask)); } if (InFlag.getNode()) Ops.push_back(InFlag); if (isTailCall) { // We used to do: //// If this is the first return lowered for this function, add the regs //// to the liveout set for the function. // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); return Ret; } if (HasNoCfCheck && IsCFProtectionSupported) { Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); } else { Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); } InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); // Save heapallocsite metadata. if (CLI.CS) if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite")) DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPop = 4; else NumBytesForCalleeToPop = 0; // Callee pops nothing. if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { // No need to reset the stack after the call if the call doesn't return. To // make the MI verify, we'll pretend the callee does it for us. NumBytesForCalleeToPop = NumBytes; } // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals, RegMask); } //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// // Like std call, callee cleans arguments, convention except that ECX is // reserved for storing the tail called function address. Only 2 registers are // free for argument passing (inreg). Tail call optimization is performed // provided: // * tailcallopt is enabled // * caller/callee are fastcc // On X86_64 architecture with GOT-style position independent code only local // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the // original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 // arg2 // RETADDR // [ new RETADDR // move area ] // (possible EBP) // ESI // EDI // local1 .. /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; } /// Return true if the given stack call argument is already available in the /// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII, const CCValAssign &VA) { unsigned Bytes = Arg.getValueSizeInBits() / 8; for (;;) { // Look through nodes that don't alter the bits of the incoming value. unsigned Op = Arg.getOpcode(); if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { Arg = Arg.getOperand(0); continue; } if (Op == ISD::TRUNCATE) { const SDValue &TruncInput = Arg.getOperand(0); if (TruncInput.getOpcode() == ISD::AssertZext && cast(TruncInput.getOperand(1))->getVT() == Arg.getValueType()) { Arg = TruncInput.getOperand(0); continue; } } break; } int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(*Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI.isFixedObjectIndex(FI)) return false; if (Offset != MFI.getObjectOffset(FI)) return false; // If this is not byval, check that the argument stack object is immutable. // inalloca and argument copy elision can create mutable argument stack // objects. Byval objects can be mutated, but a byval call intends to pass the // mutated memory. if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) return false; if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) { // If the argument location is wider than the argument type, check that any // extension flags match. if (Flags.isZExt() != MFI.isObjectZExt(FI) || Flags.isSExt() != MFI.isObjectSExt(FI)) { return false; } } return Bytes == MFI.getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this // space. if (IsCalleeWin64 != IsCallerWin64) return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector RVLocs; CCState CCInfo(CalleeCC, false, MF, RVLocs, C); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, RetCC_X86, RetCC_X86)) return false; // The callee has to preserve all registers the caller needs to preserve. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } unsigned StackArgsSize = 0; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII, VA)) return false; } } } bool PositionIndependent = isPositionIndependent(); // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget.is64Bit() && ((!isa(Callee) && !isa(Callee)) || PositionIndependent)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = PositionIndependent ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; unsigned Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; } bool CalleeWillPop = X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); if (unsigned BytesToPop = MF.getInfo()->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) return false; } else if (CalleeWillPop && StackArgsSize > 0) { // If we don't have bytes to pop, make sure the callee doesn't pop any. return false; } return true; } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// static bool MayFoldLoad(SDValue Op) { return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); } static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } static bool MayFoldIntoZeroExtend(SDValue Op) { if (Op.hasOneUse()) { unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); return (ISD::ZERO_EXTEND == Opcode); } return false; } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::MOVLHPS: case X86ISD::MOVHLPS: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VBROADCAST: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VPERMIL2: case X86ISD::VPERMI: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VZEXT_MOVL: return true; } } static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; // Target Shuffles. case X86ISD::PSHUFB: case X86ISD::VPERMILPV: case X86ISD::VPERMIL2: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: return true; // 'Faux' Target Shuffles. case ISD::OR: case ISD::AND: case X86ISD::ANDNP: return true; } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!hasSymbolicDisplacement) return true; // FIXME: Some tweaks might be needed for medium code model. if (M != CodeModel::Small && M != CodeModel::Kernel) return false; // For small code model we assume that latest object is 16MB before end of 31 // bits boundary. We may also accept pretty large negative constants knowing // that all objects are in the positive half of address space. if (M == CodeModel::Small && Offset < 16*1024*1024) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (M == CodeModel::Kernel && Offset >= 0) return true; return false; } /// Determines whether the callee is required to pop its own arguments. /// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { // If GuaranteeTCO is true, we force some calls to be callee pop so that we // can guarantee TCO. if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) return true; switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::X86_VectorCall: return !is64Bit; } } /// Return true if the condition is an unsigned comparison operation. static bool isX86CCUnsigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: case X86::COND_NE: case X86::COND_B: case X86::COND_A: case X86::COND_BE: case X86::COND_AE: return true; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return false; } } static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { switch (SetCCOpcode) { default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; } } /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } } return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; } } /// Is there a floating point cmov for the specific X86 condition code? /// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) return false; Info.flags = MachineMemOperand::MONone; Info.offset = 0; switch (IntrData->Type) { case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; if (IntrData->Type == TRUNCATE_TO_MEM_VI8) ScalarVT = MVT::i8; else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) ScalarVT = MVT::i16; else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); Info.align = 1; Info.flags |= MachineMemOperand::MOStore; break; } case GATHER: case GATHER_AVX2: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = 1; Info.flags |= MachineMemOperand::MOLoad; break; } case SCATTER: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = 1; Info.flags |= MachineMemOperand::MOStore; break; } default: return false; } return true; } /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; } return false; } bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. SDValue BasePtr = cast(Load)->getBasePtr(); if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; // If this is an (1) AVX vector load with (2) multiple uses and (3) all of // those uses are extracted directly into a store, then the extract + store // can be store-folded. Therefore, it's probably not worth splitting the load. EVT VT = Load->getValueType(0); if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) { for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) { // Skip uses of the chain value. Result 0 of the node is the load value. if (UI.getUse().getResNo() != 0) continue; // If this use is not an extract + store, it's probably worth splitting. if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() || UI->use_begin()->getOpcode() != ISD::STORE) return true; } // All non-chain uses are extract + store. return false; } return true; } /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0 || BitSize > 64) return false; return true; } bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { // If we are using XMM registers in the ABI and the condition of the select is // a floating-point compare and we have blendv or conditional move, then it is // cheaper to select instead of doing a cross-register move and creating a // load that depends on the compare result. return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); } bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { // TODO: It might be a win to ease or lift this restriction, but the generic // folds in DAGCombiner conflict with vector folds for an AVX512 target. if (VT.isVector() && Subtarget.hasAVX512()) return false; return true; } bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const { // TODO: We handle scalars using custom code, but generic combining could make // that unnecessary. APInt MulC; if (!ISD::isConstantSplatVector(C.getNode(), MulC)) return false; // If vector multiply is legal, assume that's faster than shl + add/sub. // TODO: Multiply is a complex op with higher latency and lower througput in // most implementations, so this check could be loosened based on type // and/or a CPU attribute. if (isOperationLegal(ISD::MUL, VT)) return false; // shl+add, shl+sub, shl+add+neg return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() || (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2(); } bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, bool IsSigned) const { // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available. return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov(); } bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; // Mask vectors support all subregister combinations and operations that // extract half of vector. if (ResVT.getVectorElementType() == MVT::i1) return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) && (Index == ResVT.getVectorNumElements())); return (Index % ResVT.getVectorNumElements()) == 0; } bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { unsigned Opc = VecOp.getOpcode(); // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? if (Opc >= ISD::BUILTIN_OP_END) return false; // If the vector op is not supported, try to convert to scalar. EVT VecVT = VecOp.getValueType(); if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) return true; // If the vector op is supported, but the scalar op is not, the transform may // not be worthwhile. EVT ScalarVT = VecVT.getScalarType(); return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); } bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const { // TODO: Allow vectors? if (VT.isVector()) return false; return VT.isSimple() || !isOperationExpand(Opcode, VT); } bool X86TargetLowering::isCheapToSpeculateCttz() const { // Speculate cttz only if we can directly use TZCNT. return Subtarget.hasBMI(); } bool X86TargetLowering::isCheapToSpeculateCtlz() const { // Speculate ctlz only if we can directly use LZCNT. return Subtarget.hasLZCNT(); } bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const { if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && BitcastVT.getVectorElementType() == MVT::i1) return false; if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8) return false; // If both types are legal vectors, it's always ok to convert them. if (LoadVT.isVector() && BitcastVT.isVector() && isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) return true; return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const { // Do not merge to float value size (128 bytes) if no implicit // float attribute is set. bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (NoFloat) { unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; return (MemVT.getSizeInBits() <= MaxIntSize); } // Make sure we don't merge greater than our preferred vector // width. if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) return false; return true; } bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( const Instruction &AndI) const { return true; } bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { EVT VT = Y.getValueType(); if (VT.isVector()) return false; if (!Subtarget.hasBMI()) return false; // There are only 32-bit and 64-bit forms for 'andn'. if (VT != MVT::i32 && VT != MVT::i64) return false; return !isa(Y); } bool X86TargetLowering::hasAndNot(SDValue Y) const { EVT VT = Y.getValueType(); if (!VT.isVector()) return hasAndNotCompare(Y); // Vector. if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128) return false; if (VT == MVT::v4i32) return true; return Subtarget.hasSSE2(); } bool X86TargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { assert(((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && "Expected shift-shift mask"); EVT VT = N->getValueType(0); if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { // Only fold if the shift values are equal - so it folds to AND. // TODO - we should fold if either is a non-uniform vector but we don't do // the fold for non-splats yet. return N->getOperand(1) == N->getOperand(0).getOperand(1); } return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level); } bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { EVT VT = Y.getValueType(); // For vectors, we don't have a preference, but we probably want a mask. if (VT.isVector()) return false; // 64-bit shifts on 32-bit targets produce really bad bloated code. if (VT == MVT::i64 && !Subtarget.is64Bit()) return false; return true; } +bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget.isOSWindows()) + return false; + return true; +} + bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const { // Any legal vector type can be splatted more efficiently than // loading/spilling from memory. return isTypeLegal(VT); } MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { MVT VT = MVT::getIntegerVT(NumBits); if (isTypeLegal(VT)) return VT; // PMOVMSKB can handle this. if (NumBits == 128 && isTypeLegal(MVT::v16i8)) return MVT::v16i8; // VPMOVMSKB can handle this. if (NumBits == 256 && isTypeLegal(MVT::v32i8)) return MVT::v32i8; // TODO: Allow 64-bit type for 32-bit target. // TODO: 512-bit types should be allowed, but make sure that those // cases are handled in combineVectorSizedSetCCEquality(). return MVT::INVALID_SIMPLE_VALUE_TYPE; } /// Val is the undef sentinel value or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return ((Val == SM_SentinelUndef) || (Val == CmpVal)); } /// Val is either the undef or zero sentinel value. static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); } /// Return true if every element in Mask, beginning from position Pos and ending /// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (Mask[i] != SM_SentinelUndef) return false; return true; } /// Return true if the mask creates a vector whose lower half is undefined. static bool isUndefLowerHalf(ArrayRef Mask) { unsigned NumElts = Mask.size(); return isUndefInRange(Mask, 0, NumElts / 2); } /// Return true if the mask creates a vector whose upper half is undefined. static bool isUndefUpperHalf(ArrayRef Mask) { unsigned NumElts = Mask.size(); return isUndefInRange(Mask, NumElts / 2, NumElts / 2); } /// Return true if Val falls within the specified range (L, H]. static bool isInRange(int Val, int Low, int Hi) { return (Val >= Low && Val < Hi); } /// Return true if the value of any element in Mask falls within the specified /// range (L, H]. static bool isAnyInRange(ArrayRef Mask, int Low, int Hi) { for (int M : Mask) if (isInRange(M, Low, Hi)) return true; return false; } /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi); } /// Return true if every element in Mask is undef or if its value /// falls within the specified range (L, H]. static bool isUndefOrInRange(ArrayRef Mask, int Low, int Hi) { for (int M : Mask) if (!isUndefOrInRange(M, Low, Hi)) return false; return true; } /// Return true if Val is undef, zero or if its value falls within the /// specified range (L, H]. static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { return isUndefOrZero(Val) || isInRange(Val, Low, Hi); } /// Return true if every element in Mask is undef, zero or if its value /// falls within the specified range (L, H]. static bool isUndefOrZeroOrInRange(ArrayRef Mask, int Low, int Hi) { for (int M : Mask) if (!isUndefOrZeroOrInRange(M, Low, Hi)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos + Size, falls within the specified /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef. static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low, int Step = 1) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size], or is undef or is zero. static bool isSequentialOrUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low) if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef or is zero. static bool isUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (!isUndefOrZero(Mask[i])) return false; return true; } /// Helper function to test whether a shuffle mask could be /// simplified by widening the elements being shuffled. /// /// Appends the mask for wider elements in WidenedMask if valid. Otherwise /// leaves it in an unspecified state. /// /// NOTE: This must handle normal vector shuffle masks and *target* vector /// shuffle masks. The latter have the special property of a '-2' representing /// a zero-ed lane of a vector. static bool canWidenShuffleElements(ArrayRef Mask, SmallVectorImpl &WidenedMask) { WidenedMask.assign(Mask.size() / 2, 0); for (int i = 0, Size = Mask.size(); i < Size; i += 2) { int M0 = Mask[i]; int M1 = Mask[i + 1]; // If both elements are undef, its trivial. if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { WidenedMask[i / 2] = SM_SentinelUndef; continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { WidenedMask[i / 2] = M1 / 2; continue; } if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { WidenedMask[i / 2] = M0 / 2; continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { WidenedMask[i / 2] = SM_SentinelZero; continue; } return false; } // Finally check if the two mask values are adjacent and aligned with // a pair. if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { WidenedMask[i / 2] = M0 / 2; continue; } // Otherwise we can't safely widen the elements used in this shuffle. return false; } assert(WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"); return true; } static bool canWidenShuffleElements(ArrayRef Mask, const APInt &Zeroable, SmallVectorImpl &WidenedMask) { SmallVector TargetMask(Mask.begin(), Mask.end()); for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { if (TargetMask[i] == SM_SentinelUndef) continue; if (Zeroable[i]) TargetMask[i] = SM_SentinelZero; } return canWidenShuffleElements(TargetMask, WidenedMask); } static bool canWidenShuffleElements(ArrayRef Mask) { SmallVector WidenedMask; return canWidenShuffleElements(Mask, WidenedMask); } /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); } // Build a vector of constants. // Use an UNDEF node if MaskElt == -1. // Split 64-bit constants in the 32-bit mode. static SDValue getConstVector(ArrayRef Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask = false) { SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0; i < NumElts; ++i) { bool IsUndef = Values[i] < 0 && IsMask; SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT); Ops.push_back(OpNode); if (Split) Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(0, dl, EltVT)); } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); if (Split) ConstsNode = DAG.getBitcast(VT, ConstsNode); return ConstsNode; } static SDValue getConstVector(ArrayRef Bits, APInt &Undefs, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(Bits.size() == Undefs.getBitWidth() && "Unequal constant and undef arrays"); SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Undefs[i]) { Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT)); continue; } const APInt &V = Bits[i]; assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes"); if (Split) { Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT)); Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT)); } else if (EltVT == MVT::f32) { APFloat FV(APFloat::IEEEsingle(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else if (EltVT == MVT::f64) { APFloat FV(APFloat::IEEEdouble(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else { Ops.push_back(DAG.getConstant(V, dl, EltVT)); } } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); return DAG.getBitcast(VT, ConstsNode); } /// Returns a vector of specified type with all zero elements. static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && "Unexpected vector type"); // Try to build SSE/AVX zero vectors as bitcasted to their dest // type. This ensures they get CSE'd. But if the integer type is not // available, use a floating-point +0.0 instead. SDValue Vec; if (!Subtarget.hasSSE2() && VT.is128BitVector()) { Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); Vec = DAG.getConstant(0, dl, VT); } else { unsigned Num32BitElts = VT.getSizeInBits() / 32; Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts)); } return DAG.getBitcast(VT, Vec); } static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector(ResultVT, dl, Vec->ops().slice(IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert((Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); return extractSubVector(Vec, IdxVal, DAG, dl, 128); } /// Generate a DAG to grab 256-bits from a 512-bit vector. static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); return extractSubVector(Vec, IdxVal, DAG, dl, 256); } static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.isUndef()) return Result; EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } /// Widen a vector to a larger size with the same scalar type, with the new /// elements either zero or undef. static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueSizeInBits() < VT.getSizeInBits() && Vec.getValueType().getScalarType() == VT.getScalarType() && "Unsupported vector widening type"); SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl) : DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec, DAG.getIntPtrConstant(0, dl)); } /// Widen a vector to a larger size with the same scalar type, with the new /// elements either zero or undef. static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl, unsigned WideSizeInBits) { assert(Vec.getValueSizeInBits() < WideSizeInBits && (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && "Unsupported vector widening type"); unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); MVT SVT = Vec.getSimpleValueType().getScalarType(); MVT VT = MVT::getVectorVT(SVT, WideNumElts); return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); } // Helper function to collect subvector ops that are concated together, // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. // The subvectors in Ops are guaranteed to be the same type. static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops) { assert(Ops.empty() && "Expected an empty ops vector"); if (N->getOpcode() == ISD::CONCAT_VECTORS) { Ops.append(N->op_begin(), N->op_end()); return true; } if (N->getOpcode() == ISD::INSERT_SUBVECTOR && isa(N->getOperand(2))) { SDValue Src = N->getOperand(0); SDValue Sub = N->getOperand(1); const APInt &Idx = N->getConstantOperandAPInt(2); EVT VT = Src.getValueType(); EVT SubVT = Sub.getValueType(); // TODO - Handle more general insert_subvector chains. if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && Idx == (VT.getVectorNumElements() / 2) && Src.getOpcode() == ISD::INSERT_SUBVECTOR && Src.getOperand(1).getValueType() == SubVT && isNullConstant(Src.getOperand(2))) { Ops.push_back(Src.getOperand(1)); Ops.push_back(Sub); return true; } } return false; } // Helper for splitting operands of an operation to legal target size and // apply a function on each part. // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for // deciding if/how to split Ops. Ops elements do *not* have to be of type VT. // The argument Builder is a function that will be applied on each split part: // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef) template SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef Ops, F Builder, bool CheckBWI = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; if ((CheckBWI && Subtarget.useBWIRegs()) || (!CheckBWI && Subtarget.useAVX512Regs())) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); } } else if (Subtarget.hasAVX2()) { if (VT.getSizeInBits() > 256) { NumSubs = VT.getSizeInBits() / 256; assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size"); } } else { if (VT.getSizeInBits() > 128) { NumSubs = VT.getSizeInBits() / 128; assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size"); } } if (NumSubs == 1) return Builder(DAG, DL, Ops); SmallVector Subs; for (unsigned i = 0; i != NumSubs; ++i) { SmallVector SubOps; for (SDValue Op : Ops) { EVT OpVT = Op.getValueType(); unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs; unsigned SizeSub = OpVT.getSizeInBits() / NumSubs; SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub)); } Subs.push_back(Builder(DAG, DL, SubOps)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); } /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); if (!isa(Idx)) return SDValue(); // Inserting undef is a nop. We can just return the original vector. if (SubVec.isUndef()) return Vec; unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); // Extend to natively supported kshift. MVT WideOpVT = OpVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts // if necessary. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { // May need to promote to a legal type. Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), SubVec, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } MVT SubVecVT = SubVec.getSimpleValueType(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); SDValue Undef = DAG.getUNDEF(WideOpVT); if (IdxVal == 0) { // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), SubVec, ZeroIdx); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx); if (Vec.isUndef()) { assert(IdxVal != 0 && "Unexpected index"); SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } if (ISD::isBuildVectorAllZeros(Vec.getNode())) { assert(IdxVal != 0 && "Unexpected index"); NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(ShiftLeft, dl, MVT::i8)); if (ShiftRight != 0) SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, DAG.getConstant(ShiftRight, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows // isel to opimitize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), Vec, ZeroIdx); } else { // Otherwise use explicit shifts to zero the bits. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); NumElems = WideOpVT.getVectorNumElements(); SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); } Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } // Inserting into the middle is more complicated. NumElems = WideOpVT.getVectorNumElements(); // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); // Move the current value of the bit to be replace to the lsbs. Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Xor with the new bit. Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); // Shift to MSB, filling bottom bits with 0. unsigned ShiftLeft = NumElems - SubVecNumElems; Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, DAG.getConstant(ShiftLeft, dl, MVT::i8)); // Shift to the final position, filling upper bits with 0. unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, DAG.getConstant(ShiftRight, dl, MVT::i8)); // Xor with original vector leaving the new value. Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); // Reduce to original width if needed. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, const SDLoc &dl, unsigned VectorWidth) { SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth); return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth); } /// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); APInt Ones = APInt::getAllOnesValue(32); unsigned NumElts = VT.getSizeInBits() / 32; SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } // Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode. static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) { switch (Opcode) { case ISD::ANY_EXTEND: case ISD::ANY_EXTEND_VECTOR_INREG: return ISD::ANY_EXTEND_VECTOR_INREG; case ISD::ZERO_EXTEND: case ISD::ZERO_EXTEND_VECTOR_INREG: return ISD::ZERO_EXTEND_VECTOR_INREG; case ISD::SIGN_EXTEND: case ISD::SIGN_EXTEND_VECTOR_INREG: return ISD::SIGN_EXTEND_VECTOR_INREG; } llvm_unreachable("Unknown opcode"); } static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || ISD::ZERO_EXTEND == Opcode) && "Unknown extension opcode"); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. if (InVT.getSizeInBits() > 128) { assert(VT.getSizeInBits() == InVT.getSizeInBits() && "Expected VTs to be the same size!"); unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, std::max(128U, VT.getSizeInBits() / Scale)); InVT = In.getValueType(); } if (VT.getVectorNumElements() != InVT.getVectorNumElements()) Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); return DAG.getNode(Opcode, DL, VT, In); } /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); int NumElems = VT.getVectorNumElements(); SmallVector MaskVec(NumElems); for (int i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec[i] = (i == Idx) ? NumElems : i; return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { if (!Load || !ISD::isNormalLoad(Load)) return nullptr; SDValue Ptr = Load->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *CNode = dyn_cast(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) return nullptr; return CNode->getConstVal(); } static const Constant *getTargetConstantFromNode(SDValue Op) { Op = peekThroughBitcasts(Op); return getTargetConstantFromNode(dyn_cast(Op)); } const Constant * X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { assert(LD && "Unexpected null LoadSDNode"); return getTargetConstantFromNode(LD); } // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl &EltBits, bool AllowWholeUndefs = true, bool AllowPartialUndefs = true) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; // Bitcast a source array of element bits to the target size. auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && "Constant bit sizes don't match"); // Don't split if we don't allow undef bits. bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; if (UndefSrcElts.getBoolValue() && !AllowUndefs) return false; // If we're already the right size, don't bother bitcasting. if (NumSrcElts == NumElts) { UndefElts = UndefSrcElts; EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); return true; } // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(SizeInBits, 0); APInt MaskBits(SizeInBits, 0); for (unsigned i = 0; i != NumSrcElts; ++i) { unsigned BitOffset = i * SrcEltSizeInBits; if (UndefSrcElts[i]) UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); MaskBits.insertBits(SrcEltBits[i], BitOffset); } // Split the undef/constant single bitset data into the target elements. UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { unsigned BitOffset = i * EltSizeInBits; APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnesValue()) { if (!AllowWholeUndefs) return false; UndefElts.setBit(i); continue; } // If only some bits are UNDEF then treat them as zero (or bail if not // supported). if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); } return true; }; // Collect constant bits and insert into mask/undef bit masks. auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, unsigned UndefBitIndex) { if (!Cst) return false; if (isa(Cst)) { Undefs.setBit(UndefBitIndex); return true; } if (auto *CInt = dyn_cast(Cst)) { Mask = CInt->getValue(); return true; } if (auto *CFP = dyn_cast(Cst)) { Mask = CFP->getValueAPF().bitcastToAPInt(); return true; } return false; }; // Handle UNDEFs. if (Op.isUndef()) { APInt UndefSrcElts = APInt::getAllOnesValue(NumElts); SmallVector SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract scalar constant bits. if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getNullValue(1); SmallVector SrcEltBits(1, Cst->getAPIntValue()); return CastBitData(UndefSrcElts, SrcEltBits); } if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getNullValue(1); APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); SmallVector SrcEltBits(1, RawBits); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from build vector. if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { const SDValue &Src = Op.getOperand(i); if (Src.isUndef()) { UndefSrcElts.setBit(i); continue; } auto *Cst = cast(Src); SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); } return CastBitData(UndefSrcElts, SrcEltBits); } if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { const SDValue &Src = Op.getOperand(i); if (Src.isUndef()) { UndefSrcElts.setBit(i); continue; } auto *Cst = cast(Src); APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits); } return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0) return false; unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0; i != NumSrcElts; ++i) if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], UndefSrcElts, i)) return false; return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from a broadcasted constant pool scalar. if (Op.getOpcode() == X86ISD::VBROADCAST && EltSizeInBits <= VT.getScalarSizeInBits()) { if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { if (UndefSrcElts[0]) UndefSrcElts.setBits(0, NumSrcElts); SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); return CastBitData(UndefSrcElts, SrcEltBits); } } } // Extract constant bits from a subvector broadcast. if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { SmallVector SubEltBits; if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, SubEltBits, AllowWholeUndefs, AllowPartialUndefs)) { UndefElts = APInt::getSplat(NumElts, UndefElts); while (EltBits.size() < NumElts) EltBits.append(SubEltBits.begin(), SubEltBits.end()); return true; } } // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && isa(Op.getOperand(0).getOperand(0))) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits; auto *CN = cast(Op.getOperand(0).getOperand(0)); SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits)); SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Insert constant bits from a base and sub vector sources. if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && isa(Op.getOperand(2))) { // TODO - support insert_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; APInt UndefSubElts; SmallVector EltSubBits; if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, UndefSubElts, EltSubBits, AllowWholeUndefs, AllowPartialUndefs) && getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, EltBits, AllowWholeUndefs, AllowPartialUndefs)) { unsigned BaseIdx = Op.getConstantOperandVal(2); UndefElts.insertBits(UndefSubElts, BaseIdx); for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) EltBits[BaseIdx + i] = EltSubBits[i]; return true; } } // Extract constant bits from a subvector's source. if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && isa(Op.getOperand(1))) { // TODO - support extract_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, EltBits, AllowWholeUndefs, AllowPartialUndefs)) { EVT SrcVT = Op.getOperand(0).getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumSubElts = VT.getVectorNumElements(); unsigned BaseIdx = Op.getConstantOperandVal(1); UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx); if ((BaseIdx + NumSubElts) != NumSrcElts) EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end()); if (BaseIdx != 0) EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx); return true; } } // Extract constant bits from shuffle node sources. if (auto *SVN = dyn_cast(Op)) { // TODO - support shuffle through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; ArrayRef Mask = SVN->getMask(); if ((!AllowWholeUndefs || !AllowPartialUndefs) && llvm::any_of(Mask, [](int M) { return M < 0; })) return false; APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if (isAnyInRange(Mask, 0, NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts0, EltBits0, AllowWholeUndefs, AllowPartialUndefs)) return false; if (isAnyInRange(Mask, NumElts, 2 * NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, UndefElts1, EltBits1, AllowWholeUndefs, AllowPartialUndefs)) return false; UndefElts = APInt::getNullValue(NumElts); for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; if (M < 0) { UndefElts.setBit(i); EltBits.push_back(APInt::getNullValue(EltSizeInBits)); } else if (M < (int)NumElts) { if (UndefElts0[M]) UndefElts.setBit(i); EltBits.push_back(EltBits0[M]); } else { if (UndefElts1[M - NumElts]) UndefElts.setBit(i); EltBits.push_back(EltBits1[M - NumElts]); } } return true; } return false; } static bool isConstantSplat(SDValue Op, APInt &SplatVal) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits, true, false)) { int SplatIndex = -1; for (int i = 0, e = EltBits.size(); i != e; ++i) { if (UndefElts[i]) continue; if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) { SplatIndex = -1; break; } SplatIndex = i; } if (0 <= SplatIndex) { SplatVal = EltBits[SplatIndex]; return true; } } return false; } static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl &RawMask, APInt &UndefElts) { // Extract the raw target constant bits. SmallVector EltBits; if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) return false; // Insert the extracted elements into the mask. for (APInt Elt : EltBits) RawMask.push_back(Elt.getZExtValue()); return true; } /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. /// Note: This ignores saturation, so inputs must be checked first. static void createPackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Unary) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); unsigned Offset = Unary ? 0 : NumElts; for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) Mask.push_back(Elt + (Lane * NumEltsPerLane)); for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); } } // Split the demanded elts of a PACKSS/PACKUS node between its operands. static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { int NumLanes = VT.getSizeInBits() / 128; int NumElts = DemandedElts.getBitWidth(); int NumInnerElts = NumElts / 2; int NumEltsPerLane = NumElts / NumLanes; int NumInnerEltsPerLane = NumInnerElts / NumLanes; DemandedLHS = APInt::getNullValue(NumInnerElts); DemandedRHS = APInt::getNullValue(NumInnerElts); // Map DemandedElts to the packed operands. for (int Lane = 0; Lane != NumLanes; ++Lane) { for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { int OuterIdx = (Lane * NumEltsPerLane) + Elt; int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; if (DemandedElts[OuterIdx]) DemandedLHS.setBit(InnerIdx); if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) DemandedRHS.setBit(InnerIdx); } } } // Split the demanded elts of a HADD/HSUB node between its operands. static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { int NumLanes = VT.getSizeInBits() / 128; int NumElts = DemandedElts.getBitWidth(); int NumEltsPerLane = NumElts / NumLanes; int HalfEltsPerLane = NumEltsPerLane / 2; DemandedLHS = APInt::getNullValue(NumElts); DemandedRHS = APInt::getNullValue(NumElts); // Map DemandedElts to the horizontal operands. for (int Idx = 0; Idx != NumElts; ++Idx) { if (!DemandedElts[Idx]) continue; int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; int LocalIdx = Idx % NumEltsPerLane; if (LocalIdx < HalfEltsPerLane) { DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0); DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1); } else { LocalIdx -= HalfEltsPerLane; DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0); DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1); } } } /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. /// Sets \p IsUnary to true if only one source is used. Note that this will set /// IsUnary for shuffles which use a single input multiple times, and in those /// cases it will adjust the mask to only have indices within that single input. /// It is an error to call this with non-empty Mask/Ops vectors. static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl &Ops, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); unsigned MaskEltSize = VT.getScalarSizeInBits(); SmallVector RawMask; APInt RawUndefs; SDValue ImmN; assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); IsUnary = false; bool IsFakeUnary = false; switch (N->getOpcode()) { case X86ISD::BLENDI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeBLENDMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeSHUFPMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::INSERTPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeINSERTPSMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::EXTRQI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); if (isa(N->getOperand(1)) && isa(N->getOperand(2))) { int BitLen = N->getConstantOperandVal(1); int BitIdx = N->getConstantOperandVal(2); DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = true; } break; case X86ISD::INSERTQI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); if (isa(N->getOperand(2)) && isa(N->getOperand(3))) { int BitLen = N->getConstantOperandVal(2); int BitIdx = N->getConstantOperandVal(3); DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); } break; case X86ISD::UNPCKH: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKHMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKLMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePALIGNRMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(0)); break; case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSLLDQMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSRLDQMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFHWMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFLWMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeZeroMoveLowMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VBROADCAST: { SDValue N0 = N->getOperand(0); // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, // add the pre-extracted value to the Ops vector. if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.getOperand(0).getValueType() == VT && N0.getConstantOperandVal(1) == 0) Ops.push_back(N0.getOperand(0)); // We only decode broadcasts of same-sized vectors, unless the broadcast // came from an extract from the original width. If we found one, we // pushed it the Ops vector above. if (N0.getValueType() == VT || !Ops.empty()) { DecodeVectorBroadcast(NumElems, Mask); IsUnary = true; break; } return false; } case X86ISD::VPERMILPV: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::PSHUFB: { assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodePSHUFBMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeVPERMMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeVPERM2X128Mask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUF128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSLDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSHDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVDDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVDDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VPERMIL2: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); SDValue MaskNode = N->getOperand(2); SDValue CtrlNode = N->getOperand(3); if (ConstantSDNode *CtrlOp = dyn_cast(CtrlNode)) { unsigned CtrlImm = CtrlOp->getZExtValue(); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs, Mask); break; } } return false; } case X86ISD::VPPERM: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); SDValue MaskNode = N->getOperand(2); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodeVPPERMMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV: { assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. Ops.push_back(N->getOperand(1)); SDValue MaskNode = N->getOperand(0); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMVMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV3: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. Ops.push_back(N->getOperand(0)); Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMV3Mask(RawMask, RawUndefs, Mask); break; } return false; } default: llvm_unreachable("unknown target shuffle node"); } // Empty mask indicates the decode failed. if (Mask.empty()) return false; // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero) if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); // If we didn't already add operands in the opcode-specific code, default to // adding 1 or 2 operands starting at 0. if (Ops.empty()) { Ops.push_back(N->getOperand(0)); if (!IsUnary || IsFakeUnary) Ops.push_back(N->getOperand(1)); } return true; } /// Check a target shuffle mask's inputs to see if we can set any values to /// SM_SentinelZero - this is for elements that are known to be zero /// (not just zeroable) from their inputs. /// Returns true if the target shuffle mask was decoded. static bool setTargetShuffleZeroElements(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; MVT VT = N.getSimpleValueType(); if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Mask.size()) == 0 && "Illegal split of shuffle value type"); unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); // Extract known constant input data. APInt UndefSrcElts[2]; SmallVector SrcEltBits[2]; bool IsSrcConstant[2] = { getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], true, false), getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false)}; for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. if (M < 0) continue; // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; SDValue V = M < Size ? V1 : V2; M %= Size; // We are referencing an UNDEF input. if (V.isUndef()) { Mask[i] = SM_SentinelUndef; continue; } // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. // TODO: We currently only set UNDEF for integer types - floats use the same // registers as vectors and many of the scalar folded loads rely on the // SCALAR_TO_VECTOR pattern. if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && (Size % V.getValueType().getVectorNumElements()) == 0) { int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) Mask[i] = SM_SentinelUndef; else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) Mask[i] = SM_SentinelZero; continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) Mask[i] = SM_SentinelUndef; else if (SrcEltBits[SrcIdx][M] == 0) Mask[i] = SM_SentinelZero; } } assert(VT.getVectorNumElements() == Mask.size() && "Different mask size from vector size!"); return true; } // Forward declaration (for getFauxShuffleMask recursive check). static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, SelectionDAG &DAG); // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl &Mask, SmallVectorImpl &Ops, SelectionDAG &DAG) { Mask.clear(); Ops.clear(); MVT VT = N.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); unsigned NumSizeInBits = VT.getSizeInBits(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0) return false; assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"); unsigned Opcode = N.getOpcode(); switch (Opcode) { case ISD::VECTOR_SHUFFLE: { // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here. ArrayRef ShuffleMask = cast(N)->getMask(); if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) { Mask.append(ShuffleMask.begin(), ShuffleMask.end()); Ops.push_back(N.getOperand(0)); Ops.push_back(N.getOperand(1)); return true; } return false; } case ISD::AND: case X86ISD::ANDNP: { // Attempt to decode as a per-byte mask. APInt UndefElts; SmallVector EltBits; SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); bool IsAndN = (X86ISD::ANDNP == Opcode); uint64_t ZeroMask = IsAndN ? 255 : 0; if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { if (UndefElts[i]) { Mask.push_back(SM_SentinelUndef); continue; } uint64_t ByteBits = EltBits[i].getZExtValue(); if (ByteBits != 0 && ByteBits != 255) return false; Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); } Ops.push_back(IsAndN ? N1 : N0); return true; } case ISD::OR: { // Inspect each operand at the byte level. We can merge these into a // blend shuffle mask if for each byte at least one is masked out (zero). KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts); KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts); if (Known0.One.isNullValue() && Known1.One.isNullValue()) { bool IsByteMask = true; unsigned NumSizeInBytes = NumSizeInBits / 8; unsigned NumBytesPerElt = NumBitsPerElt / 8; APInt ZeroMask = APInt::getNullValue(NumBytesPerElt); APInt SelectMask = APInt::getNullValue(NumBytesPerElt); for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) { unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue(); unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue(); if (LHS == 255 && RHS == 0) SelectMask.setBit(i); else if (LHS == 255 && RHS == 255) ZeroMask.setBit(i); else if (!(LHS == 0 && RHS == 255)) IsByteMask = false; } if (IsByteMask) { for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) { for (unsigned j = 0; j != NumBytesPerElt; ++j) { unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0); int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs)); Mask.push_back(Idx); } } Ops.push_back(N.getOperand(0)); Ops.push_back(N.getOperand(1)); return true; } } // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1)); if (!N0.getValueType().isVector() || !N1.getValueType().isVector()) return false; SmallVector SrcMask0, SrcMask1; SmallVector SrcInputs0, SrcInputs1; if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) || !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG)) return false; int MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; scaleShuffleMask(MaskSize / SrcMask0.size(), SrcMask0, Mask0); scaleShuffleMask(MaskSize / SrcMask1.size(), SrcMask1, Mask1); for (int i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) Mask.push_back(SM_SentinelZero); else if (Mask1[i] == SM_SentinelZero) Mask.push_back(Mask0[i]); else if (Mask0[i] == SM_SentinelZero) Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size())); else return false; } for (SDValue &Op : SrcInputs0) Ops.push_back(Op); for (SDValue &Op : SrcInputs1) Ops.push_back(Op); return true; } case ISD::INSERT_SUBVECTOR: { SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); if (!isa(N.getOperand(2)) || !N->isOnlyUserOf(Sub.getNode())) return false; uint64_t InsertIdx = N.getConstantOperandVal(2); // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && Sub.getOperand(0).getValueType() == VT && isa(Sub.getOperand(1))) { uint64_t ExtractIdx = Sub.getConstantOperandVal(1); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) Mask[InsertIdx + i] = NumElts + ExtractIdx + i; Ops.push_back(Src); Ops.push_back(Sub.getOperand(0)); return true; } // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). SmallVector SubMask; SmallVector SubInputs; if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, SubMask, DAG)) return false; if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); if ((NumSubElts % SubMask.size()) == 0) { int Scale = NumSubElts / SubMask.size(); SmallVector ScaledSubMask; scaleShuffleMask(Scale, SubMask, ScaledSubMask); SubMask = ScaledSubMask; } else { int Scale = SubMask.size() / NumSubElts; NumSubElts = SubMask.size(); NumElts *= Scale; InsertIdx *= Scale; } } Ops.push_back(Src); for (SDValue &SubInput : SubInputs) { EVT SubSVT = SubInput.getValueType().getScalarType(); EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT, NumSizeInBits / SubSVT.getSizeInBits()); Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT, DAG.getUNDEF(AltVT), SubInput, DAG.getIntPtrConstant(0, SDLoc(N)))); } for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { int M = SubMask[i]; if (0 <= M) { int InputIdx = M / NumSubElts; M = (NumElts * (1 + InputIdx)) + (M % NumSubElts); } Mask[i + InsertIdx] = M; } return true; } case ISD::SCALAR_TO_VECTOR: { // Match against a scalar_to_vector of an extract from a vector, // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. SDValue N0 = N.getOperand(0); SDValue SrcExtract; if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && N0.getOperand(0).getValueType() == VT) || (N0.getOpcode() == X86ISD::PEXTRW && N0.getOperand(0).getValueType() == MVT::v8i16) || (N0.getOpcode() == X86ISD::PEXTRB && N0.getOperand(0).getValueType() == MVT::v16i8)) { SrcExtract = N0; } if (!SrcExtract || !isa(SrcExtract.getOperand(1))) return false; SDValue SrcVec = SrcExtract.getOperand(0); EVT SrcVT = SrcVec.getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); if (NumSrcElts <= SrcIdx) return false; Ops.push_back(SrcVec); Mask.push_back(SrcIdx); Mask.append(NumZeros, SM_SentinelZero); Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); return true; } case X86ISD::PINSRB: case X86ISD::PINSRW: { SDValue InVec = N.getOperand(0); SDValue InScl = N.getOperand(1); SDValue InIndex = N.getOperand(2); if (!isa(InIndex) || cast(InIndex)->getAPIntValue().uge(NumElts)) return false; uint64_t InIdx = N.getConstantOperandVal(2); // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. if (X86::isZeroNode(InScl)) { Ops.push_back(InVec); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); return true; } // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern. // TODO: Expand this to support INSERT_VECTOR_ELT/etc. unsigned ExOp = (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); if (InScl.getOpcode() != ExOp) return false; SDValue ExVec = InScl.getOperand(0); SDValue ExIndex = InScl.getOperand(1); if (!isa(ExIndex) || cast(ExIndex)->getAPIntValue().uge(NumElts)) return false; uint64_t ExIdx = InScl.getConstantOperandVal(1); Ops.push_back(InVec); Ops.push_back(ExVec); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == InIdx ? NumElts + ExIdx : i); return true; } case X86ISD::PACKSS: case X86ISD::PACKUS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && "Unexpected input value type"); APInt EltsLHS, EltsRHS; getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); // If we know input saturation won't happen we can treat this // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) || (!N1.isUndef() && DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) || (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS))) return false; } bool IsUnary = (N0 == N1); Ops.push_back(N0); if (!IsUnary) Ops.push_back(N1); createPackShuffleMask(VT, Mask, IsUnary); return true; } case X86ISD::VSHLI: case X86ISD::VSRLI: { uint64_t ShiftVal = N.getConstantOperandVal(1); // Out of range bit shifts are guaranteed to be zero. if (NumBitsPerElt <= ShiftVal) { Mask.append(NumElts, SM_SentinelZero); return true; } // We can only decode 'whole byte' bit shifts as shuffles. if ((ShiftVal % 8) != 0) break; uint64_t ByteShift = ShiftVal / 8; unsigned NumBytes = NumSizeInBits / 8; unsigned NumBytesPerElt = NumBitsPerElt / 8; Ops.push_back(N.getOperand(0)); // Clear mask to all zeros and insert the shifted byte indices. Mask.append(NumBytes, SM_SentinelZero); if (X86ISD::VSHLI == Opcode) { for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j] = i + j - ByteShift; } else { for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j - ByteShift] = i + j; } return true; } case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (!SrcVT.isVector()) return false; if (NumSizeInBits != SrcVT.getSizeInBits()) { assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && "Illegal broadcast type"); SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), NumSizeInBits / SrcVT.getScalarSizeInBits()); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, DAG.getUNDEF(SrcVT), Src, DAG.getIntPtrConstant(0, SDLoc(N))); } Ops.push_back(Src); Mask.append(NumElts, 0); return true; } case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::ANY_EXTEND_VECTOR_INREG: { SDValue Src = N.getOperand(0); EVT SrcVT = Src.getValueType(); // Extended source must be a simple vector. if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || (SrcVT.getScalarSizeInBits() % 8) != 0) return false; unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits(); bool IsAnyExtend = (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend, Mask); if (NumSizeInBits != SrcVT.getSizeInBits()) { assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && "Illegal zero-extension type"); SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(), NumSizeInBits / NumSrcBitsPerElt); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, DAG.getUNDEF(SrcVT), Src, DAG.getIntPtrConstant(0, SDLoc(N))); } Ops.push_back(Src); return true; } } return false; } /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask. static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, SmallVectorImpl &Mask) { int MaskWidth = Mask.size(); SmallVector UsedInputs; for (int i = 0, e = Inputs.size(); i < e; ++i) { int lo = UsedInputs.size() * MaskWidth; int hi = lo + MaskWidth; // Strip UNDEF input usage. if (Inputs[i].isUndef()) for (int &M : Mask) if ((lo <= M) && (M < hi)) M = SM_SentinelUndef; // Check for unused inputs. if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { for (int &M : Mask) if (lo <= M) M -= MaskWidth; continue; } // Check for repeated inputs. bool IsRepeat = false; for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) { if (UsedInputs[j] != Inputs[i]) continue; for (int &M : Mask) if (lo <= M) M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth); IsRepeat = true; break; } if (IsRepeat) continue; UsedInputs.push_back(Inputs[i]); } Inputs = UsedInputs; } /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the /// remaining input indices in case we now have a unary shuffle and adjust the /// inputs accordingly. /// Returns true if the target shuffle mask was decoded. static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, SelectionDAG &DAG) { unsigned NumElts = Op.getValueType().getVectorNumElements(); APInt DemandedElts = APInt::getAllOnesValue(NumElts); if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG)) return false; resolveTargetShuffleInputsAndMask(Inputs, Mask); return true; } /// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast(N)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; SmallVector ShuffleOps; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt == SM_SentinelZero) return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufSVT); assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into insert_subvector base/sub vector to find scalars. if (Opcode == ISD::INSERT_SUBVECTOR && isa(N->getOperand(2))) { SDValue Vec = N->getOperand(0); SDValue Sub = N->getOperand(1); EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); uint64_t SubIdx = N->getConstantOperandVal(2); if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1); return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1); } // Recurse into extract_subvector src vector to find scalars. if (Opcode == ISD::EXTRACT_SUBVECTOR && isa(N->getOperand(1))) { SDValue Src = N->getOperand(0); uint64_t SrcIdx = N->getConstantOperandVal(1); return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1); } // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); return SDValue(); } // Use PINSRB/PINSRW/PINSRD to create a build vector. static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && "Illegal vector insertion"); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < NumElts; ++i) { bool IsNonZero = (NonZeros & (1 << i)) != 0; if (!IsNonZero) continue; // If the build vector contains zeros or our first insertion is not the // first index then insert into zero vector to break any register // dependency else use SCALAR_TO_VECTOR. if (First) { First = false; if (NumZero || 0 != i) V = getZeroVector(VT, Subtarget, DAG, dl); else { assert(0 == i && "Expected insertion into zero-index"); V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); V = DAG.getBitcast(VT, V); continue; } } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return V; } /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 8 && !Subtarget.hasSSE41()) return SDValue(); // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget.hasSSE41()) return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget); SDLoc dl(Op); SDValue V; // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. for (unsigned i = 0; i < 16; i += 2) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0; if (!ThisIsNonZero && !NextIsNonZero) continue; // FIXME: Investigate combining the first 4 bytes as a i32 instead. SDValue Elt; if (ThisIsNonZero) { if (NumZero || NextIsNonZero) Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32); else Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); } if (NextIsNonZero) { SDValue NextElt = Op.getOperand(i + 1); if (i == 0 && NumZero) NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32); else NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32); NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt, DAG.getConstant(8, dl, MVT::i8)); if (ThisIsNonZero) Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt); else Elt = NextElt; } // If our first insertion is not the first index then insert into zero // vector to break any register dependency else use SCALAR_TO_VECTOR. if (!V) { if (i != 0) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else { V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt); V = DAG.getBitcast(MVT::v8i16, V); continue; } } Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt, DAG.getIntPtrConstant(i / 2, dl)); } return DAG.getBitcast(MVT::v16i8, V); } /// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 4 && !Subtarget.hasSSE41()) return SDValue(); // Use PINSRW to insert each byte directly. return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget); } /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If this is a splat of a pair of elements, use MOVDDUP (unless the target // has XOP; in that case defer lowering to potentially use VPERMIL2PS). // Because we're creating a less complicated build vector here, we may enable // further folding of the MOVDDUP via shuffle transforms. if (Subtarget.hasSSE3() && !Subtarget.hasXOP() && Op.getOperand(0) == Op.getOperand(2) && Op.getOperand(1) == Op.getOperand(3) && Op.getOperand(0) != Op.getOperand(1)) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); // Create a new build vector with the first 2 elements followed by undef // padding, bitcast to v2f64, duplicate, and bitcast back. SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops)); SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV); return DAG.getBitcast(VT, Dup); } // Find all zeroable elements. std::bitset<4> Zeroable, Undefs; for (int i = 0; i < 4; ++i) { SDValue Elt = Op.getOperand(i); Undefs[i] = Elt.isUndef(); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i = 0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = Elt.getConstantOperandVal(1); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. SDValue VZeroOrUndef = (Zeroable == Undefs) ? DAG.getUNDEF(VT) : getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); } // See if we can lower this build_vector to a INSERTPS. if (!Subtarget.hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i); } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getBitcast(MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getBitcast(MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask, DL)); return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v16i8; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || LD->isVolatile()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { FI = cast(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI.setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StartOffset, DL, Ptr.getValueType())); } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset)); SmallVector Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); } return SDValue(); } /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. /// /// Example: -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { unsigned NumElems = Elts.size(); int LastLoadedElt = -1; APInt LoadMask = APInt::getNullValue(NumElems); APInt ZeroMask = APInt::getNullValue(NumElems); APInt UndefMask = APInt::getNullValue(NumElems); SmallVector Loads(NumElems, nullptr); // For each element in the initializer, see if we've found a load, zero or an // undef. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = peekThroughBitcasts(Elts[i]); if (!Elt.getNode()) return SDValue(); if (Elt.isUndef()) { UndefMask.setBit(i); continue; } if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) { ZeroMask.setBit(i); continue; } // Each loaded element must be the correct fractional portion of the // requested vector load. if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) return SDValue(); if (!ISD::isNON_EXTLoad(Elt.getNode())) return SDValue(); Loads[i] = cast(Elt); LoadMask.setBit(i); LastLoadedElt = i; } assert((ZeroMask.countPopulation() + UndefMask.countPopulation() + LoadMask.countPopulation()) == NumElems && "Incomplete element masks"); // Handle Special Cases - all undef or undef/zero. if (UndefMask.countPopulation() == NumElems) return DAG.getUNDEF(VT); // FIXME: Should we return this as a BUILD_VECTOR instead? if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems) return VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); int FirstLoadedElt = LoadMask.countTrailingZeros(); SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); EVT EltBaseVT = EltBase.getValueType(); assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && "Register/Memory size mismatch"); LoadSDNode *LDBase = Loads[FirstLoadedElt]; assert(LDBase && "Did not find base load for merging consecutive loads"); unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); unsigned BaseSizeInBytes = BaseSizeInBits / 8; int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits; assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. bool IsConsecutiveLoad = true; bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes, i - FirstLoadedElt)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; } } else if (ZeroMask[i]) { IsConsecutiveLoad = false; } } auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(!(MMOFlags & MachineMemOperand::MOVolatile) && "Cannot merge volatile loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; // Check if the base load is entirely dereferenceable. bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable( VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout()); // LOAD - all consecutive load/undefs (must start/end with a load or be // entirely dereferenceable). If we have found an entire vector of loads and // undefs, then return a large load of the entire vector width starting at the // base pointer. If the vector contains zeros, then attempt to shuffle those // elements. if (FirstLoadedElt == 0 && (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); // Don't create 256-bit non-temporal aligned loads without AVX2 as these // will lower to regular temporal loads and use the cache. if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 && VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); if (NumElems == 1) return DAG.getBitcast(VT, Elts[FirstLoadedElt]); if (!ZeroMask) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. if (!isAfterLegalize && VT.isVector()) { SmallVector ClearMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { if (ZeroMask[i]) ClearMask[i] = i + NumElems; else if (LoadMask[i]) ClearMask[i] = i; } SDValue V = CreateLoad(VT, LDBase); SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } } // If the upper half of a ymm/zmm load is undef then just load the lower half. if (VT.is256BitVector() || VT.is512BitVector()) { unsigned HalfNumElems = NumElems / 2; if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) { EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); SDValue HalfLD = EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, DAG, Subtarget, isAfterLegalize); if (HalfLD) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), HalfLD, DAG.getIntPtrConstant(0, DL)); } } // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && (LoadSizeInBits == 32 || LoadSizeInBits == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) : MVT::getIntegerVT(LoadSizeInBits); MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), LDBase->getAlignment(), MachineMemOperand::MOLoad); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } // BROADCAST - match the smallest possible repetition pattern, load that // scalar/subvector element and then broadcast to the entire vector. if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() && (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) { for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) { unsigned RepeatSize = SubElems * BaseSizeInBits; unsigned ScalarSize = std::min(RepeatSize, 64u); if (!Subtarget.hasAVX2() && ScalarSize < 32) continue; bool Match = true; SmallVector RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT)); for (unsigned i = 0; i != NumElems && Match; ++i) { if (!LoadMask[i]) continue; SDValue Elt = peekThroughBitcasts(Elts[i]); if (RepeatedLoads[i % SubElems].isUndef()) RepeatedLoads[i % SubElems] = Elt; else Match &= (RepeatedLoads[i % SubElems] == Elt); } // We must have loads at both ends of the repetition. Match &= !RepeatedLoads.front().isUndef(); Match &= !RepeatedLoads.back().isUndef(); if (!Match) continue; EVT RepeatVT = VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64)) ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize) : EVT::getFloatingPointVT(ScalarSize); if (RepeatSize > ScalarSize) RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT, RepeatSize / ScalarSize); EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), VT.getSizeInBits() / ScalarSize); if (TLI.isTypeLegal(BroadcastVT)) { if (SDValue RepeatLoad = EltsFromConsecutiveLoads( RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) { unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST : X86ISD::VBROADCAST; SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad); return DAG.getBitcast(VT, Broadcast); } } } } return SDValue(); } // Combine a vector ops (shuffles etc.) that is equal to build_vector load1, // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses // are consecutive, non-overlapping, and in the right order. static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { Elts.push_back(Elt); continue; } return SDValue(); } assert(Elts.size() == VT.getVectorNumElements()); return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, isAfterLegalize); } static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector ConstantVec; for (unsigned i = 0; i < NumElm; i++) { APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { if (ScalarSize == 32) { Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); } else { assert(ScalarSize == 64 && "Unsupported floating point scalar size"); Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); } } else Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); ConstantVec.push_back(Const); } return ConstantVector::get(ArrayRef(ConstantVec)); } static bool isFoldableUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { unsigned Opc = U->getOpcode(); // VPERMV/VPERMV3 shuffles can never fold their index operands. if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) return false; if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N) return false; if (isTargetShuffle(Opc)) return true; if (Opc == ISD::BITCAST) // Ignore bitcasts return isFoldableUseOfShuffle(U); if (N->hasOneUse()) return true; } return false; } // Check if the current node of build vector is a zero extended vector. // // If so, return the value extended. // // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a. // // NumElt - return the number of zero extended identical values. // // EltType - return the type of the value include the zero extend. static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op, unsigned &NumElt, MVT &EltType) { SDValue ExtValue = Op->getOperand(0); unsigned NumElts = Op->getNumOperands(); unsigned Delta = NumElts; for (unsigned i = 1; i < NumElts; i++) { if (Op->getOperand(i) == ExtValue) { Delta = i; break; } if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i)))) return SDValue(); } if (!isPowerOf2_32(Delta) || Delta == 1) return SDValue(); for (unsigned i = Delta; i < NumElts; i++) { if (i % Delta == 0) { if (Op->getOperand(i) != ExtValue) return SDValue(); } else if (!(isNullConstant(Op->getOperand(i)) || Op->getOperand(i).isUndef())) return SDValue(); } unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits(); unsigned ExtVTSize = EltSize * Delta; EltType = MVT::getIntegerVT(ExtVTSize); NumElt = NumElts / Delta; return ExtValue; } /// Attempt to use the vbroadcast instruction to generate a splat value /// from a splat BUILD_VECTOR which uses: /// a. A single scalar load, or a constant. /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). /// /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. if (!Subtarget.hasAVX()) return SDValue(); MVT VT = BVOp->getSimpleValueType(0); SDLoc dl(BVOp); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); BitVector UndefElements; SDValue Ld = BVOp->getSplatValue(&UndefElements); // Attempt to use VBROADCASTM // From this paterrn: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) // b. t1 = (build_vector t0 t0) // // Create (VBROADCASTM v2i1 X) if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) { MVT EltType = VT.getScalarType(); unsigned NumElts = VT.getVectorNumElements(); SDValue BOperand; SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType); if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) || (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND && Ld.getOperand(0).getOpcode() == ISD::BITCAST)) { if (ZeroExtended) BOperand = ZeroExtended.getOperand(0); else BOperand = Ld.getOperand(0).getOperand(0); MVT MaskVT = BOperand.getSimpleValueType(); if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d SDValue Brdcst = DAG.getNode(X86ISD::VBROADCASTM, dl, MVT::getVectorVT(EltType, NumElts), BOperand); return DAG.getBitcast(VT, Brdcst); } } } unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefElts = UndefElements.count(); if (!Ld || (NumElts - NumUndefElts) <= 1) { APInt SplatValue, Undef; unsigned SplatBitSize; bool HasUndef; // Check if this is a repeated constant pattern suitable for broadcasting. if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) && SplatBitSize > VT.getScalarSizeInBits() && SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. if (isFoldableUseOfShuffle(BVOp)) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); if (Subtarget.hasAVX()) { if (SplatBitSize <= 64 && Subtarget.hasAVX2() && !(SplatBitSize == 64 && Subtarget.is32Bit())) { // Splatted value can fit in one INTEGER constant in constant pool. // Load the constant and broadcast it. MVT CVT = MVT::getIntegerVT(SplatBitSize); Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize); Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, MVT::getVectorVT(CVT, Repeat), Ld); return DAG.getBitcast(VT, Brdcst); } else if (SplatBitSize == 32 || SplatBitSize == 64) { // Splatted value can fit in one FLOAT constant in constant pool. // Load the constant and broadcast it. // AVX have support for 32 and 64 bit broadcast for floats only. // No 64bit integer in 32bit subtarget. MVT CVT = MVT::getFloatingPointVT(SplatBitSize); // Lower the splat via APFloat directly, to avoid any conversion. Constant *C = SplatBitSize == 32 ? ConstantFP::get(*Ctx, APFloat(APFloat::IEEEsingle(), SplatValue)) : ConstantFP::get(*Ctx, APFloat(APFloat::IEEEdouble(), SplatValue)); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, MVT::getVectorVT(CVT, Repeat), Ld); return DAG.getBitcast(VT, Brdcst); } else if (SplatBitSize > 64) { // Load the vector of constants and broadcast it. MVT CVT = VT.getScalarType(); Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); unsigned Alignment = cast(VCP)->getAlignment(); Ld = DAG.getLoad( MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld); return DAG.getBitcast(VT, Brdcst); } } } // If we are moving a scalar into a vector (Ld must be set and all elements // but 1 are undef) and that operation is not obviously supported by // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast. // That's better than general shuffling and may eliminate a load to GPR and // move from scalar to vector register. if (!Ld || NumElts - NumUndefElts != 1) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64))) return SDValue(); } bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget.hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. return SDValue(); } /// For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = cast(ExtIdx)->getZExtValue(); if (!isa(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector InsertIndices; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); return NV; } static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); uint64_t Immediate = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (!In.isUndef()) Immediate |= (cast(In)->getZExtValue() & 0x1) << idx; } SDLoc dl(Op); MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); return DAG.getConstant(Immediate, dl, VT); } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) return Op; if (ISD::isBuildVectorAllOnes(Op.getNode())) return Op; if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { // Split the pieces. SDValue Lower = DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32)); SDValue Upper = DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32)); // We have to manually lower both halves so getNode doesn't try to // reassemble the build_vector. Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget); Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper); } SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } // Vector has one or more non-const elements uint64_t Immediate = 0; SmallVector NonConstIdx; bool IsSplat = true; bool HasConstElts = false; int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.isUndef()) continue; if (!isa(In)) NonConstIdx.push_back(idx); else { Immediate |= (cast(In)->getZExtValue() & 0x1) << idx; HasConstElts = true; } if (SplatIdx < 0) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; SDValue Imm; if (Immediate) { MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); Imm = DAG.getConstant(Immediate, dl, ImmVT); } else if (HasConstElts) Imm = DAG.getConstant(0, dl, VT); else Imm = DAG.getUNDEF(VT); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) DstVec = DAG.getBitcast(VT, Imm); else { SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { unsigned InsertIdx = NonConstIdx[i]; DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getIntPtrConstant(InsertIdx, dl)); } return DstVec; } /// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// 128-bit partial horizontal operation on a 256-bit vector, but that operation /// may not match the layout of an x86 256-bit horizontal instruction. /// In other words, if this returns true, then some extraction/insertion will /// be required to produce a valid horizontal instruction. /// /// Parameter \p Opcode defines the kind of horizontal operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). /// /// TODO: This function was originally used to match both real and fake partial /// horizontal operations, but the index-matching logic is incorrect for that. /// See the corrected implementation in isHopBuildVector(). Can we reduce this /// code because it is only used for partial h-op matching now? static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops"); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->isUndef()) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa(Op0.getOperand(1)) && isa(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { if (V0.isUndef()) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { if (V1.isUndef()) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { MVT VT = V0.getSimpleValueType(); assert(VT.is256BitVector() && VT == V1.getSimpleValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL); SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL); SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL); SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL); MVT NewVT = V0_LO.getSimpleValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && !V0->isUndef()) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); if (!isUndefHI && !V1->isUndef()) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef())) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef())) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } /// Returns true iff \p BV builds a vector with the result equivalent to /// the result of ADDSUB/SUBADD operation. /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters /// \p Opnd0 and \p Opnd1. static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd) { MVT VT = BV->getSimpleValueType(0); if (!Subtarget.hasSSE3() || !VT.isFloatingPoint()) return false; unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); NumExtracts = 0; // Odd-numbered elements in the input build vector are obtained from // adding/subtracting two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting/adding two integer/float elements. unsigned Opc[2] = {0, 0}; for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) continue; // Early exit if we found an unexpected opcode. if (Opcode != ISD::FADD && Opcode != ISD::FSUB) return false; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return false; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); if (I0 != i) return false; // We found a valid add/sub node, make sure its the same opcode as previous // elements for this parity. if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode) return false; Opc[i % 2] = Opcode; // Update InVec0 and InVec1. if (InVec0.isUndef()) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) return false; } if (InVec1.isUndef()) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) return false; } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (Opcode == ISD::FSUB) return false; // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return false; } if (InVec1 != Op1.getOperand(0)) return false; // Increment the number of extractions done. ++NumExtracts; } // Ensure we have found an opcode for both parities and that they are // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the // inputs are undef. if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] || InVec0.isUndef() || InVec1.isUndef()) return false; IsSubAdd = Opc[0] == ISD::FADD; Opnd0 = InVec0; Opnd1 = InVec1; return true; } /// Returns true if is possible to fold MUL and an idiom that has already been /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. /// /// Prior to calling this function it should be known that there is some /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called /// before replacement of such SDNode with ADDSUB operation. Thus the number /// of \p Opnd0 uses is expected to be equal to 2. /// For example, this function may be called for the following IR: /// %AB = fmul fast <2 x double> %A, %B /// %Sub = fsub fast <2 x double> %AB, %C /// %Add = fadd fast <2 x double> %AB, %C /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, /// <2 x i32> /// There is a def for %Addsub here, which potentially can be replaced by /// X86ISD::ADDSUB operation: /// %Addsub = X86ISD::ADDSUB %AB, %C /// and such ADDSUB can further be replaced with FMADDSUB: /// %Addsub = FMADDSUB %A, %B, %C. /// /// The main reason why this method is called before the replacement of the /// recognized ADDSUB idiom with ADDSUB operation is that such replacement /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses) { if (Opnd0.getOpcode() != ISD::FMUL || !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; // FIXME: These checks must match the similar ones in // DAGCombiner::visitFADDForFMACombine. It would be good to have one // function that would answer if it is Ok to fuse MUL + ADD to FMADD // or MUL + ADDSUB to FMADDSUB. const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); if (!AllowFusion) return false; Opnd2 = Opnd1; Opnd1 = Opnd0.getOperand(1); Opnd0 = Opnd0.getOperand(0); return true; } /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or /// X86ISD::FMSUBADD node. static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; unsigned NumExtracts; bool IsSubAdd; if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd)) return SDValue(); MVT VT = BV->getSimpleValueType(0); SDLoc DL(BV); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } // We only support ADDSUB. if (IsSubAdd) return SDValue(); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom // recognition. if (VT.is512BitVector()) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1) { // Initialize outputs to known values. MVT VT = BV->getSimpleValueType(0); HOpcode = ISD::DELETED_NODE; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit // half of the result is calculated independently from the 128-bit halves of // the inputs, so that makes the index-checking logic below more complicated. unsigned NumElts = VT.getVectorNumElements(); unsigned GenericOpcode = ISD::DELETED_NODE; unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1; unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2; for (unsigned i = 0; i != Num128BitChunks; ++i) { for (unsigned j = 0; j != NumEltsIn128Bits; ++j) { // Ignore undef elements. SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j); if (Op.isUndef()) continue; // If there's an opcode mismatch, we're done. if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode) return false; // Initialize horizontal opcode. if (HOpcode == ISD::DELETED_NODE) { GenericOpcode = Op.getOpcode(); switch (GenericOpcode) { case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: return false; } } SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op0.getOperand(0) != Op1.getOperand(0) || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || !Op.hasOneUse()) return false; // The source vector is chosen based on which 64-bit half of the // destination vector is being calculated. if (j < NumEltsIn64Bits) { if (V0.isUndef()) V0 = Op0.getOperand(0); } else { if (V1.isUndef()) V1 = Op0.getOperand(0); } SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1; if (SourceVec != Op0.getOperand(0)) return false; // op (extract_vector_elt A, I), (extract_vector_elt A, I+1) unsigned ExtIndex0 = Op0.getConstantOperandVal(1); unsigned ExtIndex1 = Op1.getConstantOperandVal(1); unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2; if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1) continue; // If this is not a commutative op, this does not match. if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD) return false; // Addition is commutative, so try swapping the extract indexes. // op (extract_vector_elt A, I+1), (extract_vector_elt A, I) if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1) continue; // Extract indexes do not match horizontal requirement. return false; } } // We matched. Opcode and operands are returned by reference as arguments. return true; } static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1) { // If either input vector is not the same size as the build vector, // extract/insert the low bits to the correct size. // This is free (examples: zmm --> xmm, xmm --> ymm). MVT VT = BV->getSimpleValueType(0); unsigned Width = VT.getSizeInBits(); if (V0.getValueSizeInBits() > Width) V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width); else if (V0.getValueSizeInBits() < Width) V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width); if (V1.getValueSizeInBits() > Width) V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width); else if (V1.getValueSizeInBits() < Width) V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); unsigned NumElts = VT.getVectorNumElements(); APInt DemandedElts = APInt::getAllOnesValue(NumElts); for (unsigned i = 0; i != NumElts; ++i) if (BV->getOperand(i).isUndef()) DemandedElts.clearBit(i); // If we don't need the upper xmm, then perform as a xmm hop. unsigned HalfNumElts = NumElts / 2; if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts); V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256); } return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. unsigned NumNonUndefs = count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); }); if (NumNonUndefs < 2) return SDValue(); // There are 4 sets of horizontal math operations distinguished by type: // int/FP at 128-bit/256-bit. Each type was introduced with a different // subtarget feature. Try to match those "native" patterns first. MVT VT = BV->getSimpleValueType(0); if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) || ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) || ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) || ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) { unsigned HOpcode; SDValue V0, V1; if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); } // Try harder to match 256-bit ops by using extract/concat. if (!Subtarget.hasAVX() || !VT.is256BitVector()) return SDValue(); // Count the number of UNDEF operands in the build_vector in input. unsigned NumElts = VT.getVectorNumElements(); unsigned Half = NumElts / 2; unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; for (unsigned i = 0, e = Half; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsHI++; SDLoc DL(BV); SDValue InVec0, InVec1; if (VT == MVT::v8i32 || VT == MVT::v16i16) { SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; if (CanFold) { // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into a pair of horizontal binops followed by // a concat vector. We must adjust the outputs from the partial horizontal // matching calls above to account for undefined vector halves. SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0; SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1; assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?"); bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO, isUndefHI); } } if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) { unsigned X86Opcode; if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); // Don't try to expand this build_vector into a pair of horizontal add/sub // if we can simply emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into two horizontal add/sub followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, isUndefLO, isUndefHI); } return SDValue(); } /// If a BUILD_VECTOR's source elements all apply the same bit operation and /// one of their operands is constant, lower to a pair of BUILD_VECTOR and /// just apply the bit to the vectors. /// NOTE: Its not in our interest to start make a general purpose vectorizer /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Check that all elements have the same opcode. // TODO: Should we allow UNDEFS and if so how many? unsigned Opcode = Op->getOperand(0).getOpcode(); for (unsigned i = 1; i < NumElems; ++i) if (Opcode != Op->getOperand(i).getOpcode()) return SDValue(); // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). bool IsShift = false; switch (Opcode) { default: return SDValue(); case ISD::SHL: case ISD::SRL: case ISD::SRA: IsShift = true; break; case ISD::AND: case ISD::XOR: case ISD::OR: // Don't do this if the buildvector is a splat - we'd replace one // constant with an entire vector. if (Op->getSplatValue()) return SDValue(); if (!TLI.isOperationLegalOrPromote(Opcode, VT)) return SDValue(); break; } SmallVector LHSElts, RHSElts; for (SDValue Elt : Op->ops()) { SDValue LHS = Elt.getOperand(0); SDValue RHS = Elt.getOperand(1); // We expect the canonicalized RHS operand to be the constant. if (!isa(RHS)) return SDValue(); // Extend shift amounts. if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) { if (!IsShift) return SDValue(); RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType()); } LHSElts.push_back(LHS); RHSElts.push_back(RHS); } // Limit to shifts by uniform immediates. // TODO: Only accept vXi8/vXi64 special cases? // TODO: Permit non-uniform XOP/AVX2/MULLO cases? if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; })) return SDValue(); SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); return DAG.getNode(Opcode, DL, VT, LHS, RHS); } /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getZeroVector(VT, Subtarget, DAG, DL); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || VT == MVT::v16i32 || (VT == MVT::v8i32 && Subtarget.hasInt256())) return Op; return getOnesVector(VT, DAG, DL); } return SDValue(); } /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute /// from a vector of source values and a vector of extraction indices. /// The vectors might be manipulated to match the type of the permute op. static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT ShuffleVT = VT; EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); unsigned NumElts = VT.getVectorNumElements(); unsigned SizeInBits = VT.getSizeInBits(); // Adjust IndicesVec to match VT size. assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts && "Illegal variable permute mask size"); if (IndicesVec.getValueType().getVectorNumElements() > NumElts) IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec), NumElts * VT.getScalarSizeInBits()); IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT); // Handle SrcVec that don't match VT type. if (SrcVec.getValueSizeInBits() != SizeInBits) { if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) { // Handle larger SrcVec by treating it as a larger permute. unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits; VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts); IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false, Subtarget, DAG, SDLoc(IndicesVec)); return extractSubVector( createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0, DAG, DL, SizeInBits); } else if (SrcVec.getValueSizeInBits() < SizeInBits) { // Widen smaller SrcVec to match VT. SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); } else return SDValue(); } auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) { assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale"); EVT SrcVT = Idx.getValueType(); unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale; uint64_t IndexScale = 0; uint64_t IndexOffset = 0; // If we're scaling a smaller permute op, then we need to repeat the // indices, scaling and offsetting them as well. // e.g. v4i32 -> v16i8 (Scale = 4) // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4) // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0) for (uint64_t i = 0; i != Scale; ++i) { IndexScale |= Scale << (i * NumDstBits); IndexOffset |= i << (i * NumDstBits); } Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx, DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT)); Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx, DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT)); return Idx; }; unsigned Opcode = 0; switch (VT.SimpleTy) { default: break; case MVT::v16i8: if (Subtarget.hasSSSE3()) Opcode = X86ISD::PSHUFB; break; case MVT::v8i16: if (Subtarget.hasVLX() && Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasSSSE3()) { Opcode = X86ISD::PSHUFB; ShuffleVT = MVT::v16i8; } break; case MVT::v4f32: case MVT::v4i32: if (Subtarget.hasAVX()) { Opcode = X86ISD::VPERMILPV; ShuffleVT = MVT::v4f32; } else if (Subtarget.hasSSSE3()) { Opcode = X86ISD::PSHUFB; ShuffleVT = MVT::v16i8; } break; case MVT::v2f64: case MVT::v2i64: if (Subtarget.hasAVX()) { // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); Opcode = X86ISD::VPERMILPV; ShuffleVT = MVT::v2f64; } else if (Subtarget.hasSSE41()) { // SSE41 can compare v2i64 - select between indices 0 and 1. return DAG.getSelectCC( DL, IndicesVec, getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL), DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}), DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}), ISD::CondCode::SETEQ); } break; case MVT::v32i8: if (Subtarget.hasVLX() && Subtarget.hasVBMI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasXOP()) { SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL); SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL); SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL); SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL); return DAG.getNode( ISD::CONCAT_VECTORS, DL, VT, DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx), DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx)); } else if (Subtarget.hasAVX()) { SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL); SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL); SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo); SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi); auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Permute Lo and Hi and then select based on index range. // This works as SHUFB uses bits[3:0] to permute elements and we don't // care about the bit[7] as its just an index vector. SDValue Idx = Ops[2]; EVT VT = Idx.getValueType(); return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT), DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx), DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx), ISD::CondCode::SETGT); }; SDValue Ops[] = {LoLo, HiHi, IndicesVec}; return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops, PSHUFBBuilder); } break; case MVT::v16i16: if (Subtarget.hasVLX() && Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasAVX()) { // Scale to v32i8 and perform as v32i8. IndicesVec = ScaleIndices(IndicesVec, 2); return DAG.getBitcast( VT, createVariablePermute( MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec), DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget)); } break; case MVT::v8f32: case MVT::v8i32: if (Subtarget.hasAVX2()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasAVX()) { SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec); SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {0, 1, 2, 3, 0, 1, 2, 3}); SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {4, 5, 6, 7, 4, 5, 6, 7}); if (Subtarget.hasXOP()) return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi, IndicesVec, DAG.getConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPS only uses index bits[0:1] to permute elements. SDValue Res = DAG.getSelectCC( DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec), ISD::CondCode::SETGT); return DAG.getBitcast(VT, Res); } break; case MVT::v4i64: case MVT::v4f64: if (Subtarget.hasAVX512()) { if (!Subtarget.hasVLX()) { MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8); SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget, DAG, SDLoc(IndicesVec)); SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL, DAG, Subtarget); return extract256BitVector(Res, 0, DAG, DL); } Opcode = X86ISD::VPERMV; } else if (Subtarget.hasAVX()) { SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec); SDValue LoLo = DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1}); SDValue HiHi = DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3}); // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); if (Subtarget.hasXOP()) return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi, IndicesVec, DAG.getConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPD only uses index bit[1] to permute elements. SDValue Res = DAG.getSelectCC( DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec), ISD::CondCode::SETGT); return DAG.getBitcast(VT, Res); } break; case MVT::v64i8: if (Subtarget.hasVBMI()) Opcode = X86ISD::VPERMV; break; case MVT::v32i16: if (Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; break; case MVT::v16f32: case MVT::v16i32: case MVT::v8f64: case MVT::v8i64: if (Subtarget.hasAVX512()) Opcode = X86ISD::VPERMV; break; } if (!Opcode) return SDValue(); assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) && (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 && "Illegal variable permute shuffle type"); uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits(); if (Scale > 1) IndicesVec = ScaleIndices(IndicesVec, Scale); EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger(); IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec); SrcVec = DAG.getBitcast(ShuffleVT, SrcVec); SDValue Res = Opcode == X86ISD::VPERMV ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec) : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec); return DAG.getBitcast(VT, Res); } // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be // reasoned to be a permutation of a vector by indices in a non-constant vector. // (build_vector (extract_elt V, (extract_elt I, 0)), // (extract_elt V, (extract_elt I, 1)), // ... // -> // (vpermv I, V) // // TODO: Handle undefs // TODO: Utilize pshufb and zero mask blending to support more efficient // construction of vectors with constant-0 elements. static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; // Check for a match of the permute source vector and permute index elements. // This is done by checking that the i-th build_vector operand is of the form: // (extract_elt SrcVec, (extract_elt IndicesVec, i)). for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { SDValue Op = V.getOperand(Idx); if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract encountered in V, set the source vector, // otherwise verify the extract is from the previously defined source // vector. if (!SrcVec) SrcVec = Op.getOperand(0); else if (SrcVec != Op.getOperand(0)) return SDValue(); SDValue ExtractedIndex = Op->getOperand(1); // Peek through extends. if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) ExtractedIndex = ExtractedIndex.getOperand(0); if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract from the index vector candidate, set the // indices vector, otherwise verify the extract is from the previously // defined indices vector. if (!IndicesVec) IndicesVec = ExtractedIndex.getOperand(0); else if (IndicesVec != ExtractedIndex.getOperand(0)) return SDValue(); auto *PermIdx = dyn_cast(ExtractedIndex.getOperand(1)); if (!PermIdx || PermIdx->getZExtValue() != Idx) return SDValue(); } SDLoc DL(V); MVT VT = V.getSimpleValueType(); return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) return VectorConstant; BuildVectorSDNode *BV = cast(Op.getNode()); if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) return BitOp; unsigned EVTBits = EltVT.getSizeInBits(); unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet Values; unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.isUndef()) continue; Values.insert(Elt); if (!isa(Elt) && !isa(Elt)) { IsAllConstants = false; NumConstants--; } if (X86::isZeroNode(Elt)) NumZero++; else { assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } // All undef vector. Return an UNDEF. All zero vectors were handled above. if (NumNonZero == 0) return DAG.getUNDEF(VT); // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not // supported, fall back to a shuffle to get the scalar blended with the // constants. Insertion into a zero vector is handled as a special-case // somewhere below here. if (NumConstants == NumElems - 1 && NumNonZero != 1 && (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { // Create an all-constant vector. The variable element in the old // build vector is replaced by undef in the constant vector. Save the // variable scalar element and its index for use in the insertelement. LLVMContext &Context = *DAG.getContext(); Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context); SmallVector ConstVecOps(NumElems, UndefValue::get(EltType)); SDValue VarElt; SDValue InsIndex; for (unsigned i = 0; i != NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue()); else if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF()); else if (!Elt.isUndef()) { assert(!VarElt.getNode() && !InsIndex.getNode() && "Expected one variable element in this vector"); VarElt = Elt; InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout())); } } Constant *CV = ConstantVector::get(ConstVecOps); SDValue DAGConstVec = DAG.getConstantPool(CV, VT); // The constants we just created may not be legal (eg, floating point). We // must lower the vector right here because we can not guarantee that we'll // legalize it before loading it. This is also why we could not just create // a new build vector here. If the build vector contains illegal constants, // it could get split back up into a series of insert elements. // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD. SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG); MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); unsigned InsertC = cast(InsIndex)->getZExtValue(); unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits(); if (InsertC < NumEltsInLow128Bits) return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); // There's no good way to insert into the high elements of a >128-bit // vector, so use shuffles to avoid an extract/insert sequence. assert(VT.getSizeInBits() > 128 && "Invalid insertion index?"); assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector"); SmallVector ShuffleMask; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) ShuffleMask.push_back(i == InsertC ? NumElts : i); SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt); return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask); } // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit())) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } // We can't directly insert an i8 or i16 into a vector, so zero extend // it to i32 first. if (EltVT == MVT::i16 || EltVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); if (VT.getSizeInBits() >= 256) { MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); if (Subtarget.hasAVX()) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } else { // Without AVX, we need to extend to a 128-bit vector and then // insert into the 256-bit vector. Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); } } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) return V; // See if we can use a vector load to get all of the elements. { SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElems); if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; } // If this is a splat of pairs of 32-bit elements, we can use a narrower // build_vector and broadcast it. // TODO: We could probably generalize this more. if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) { SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef Ops) { // Make sure all the even/odd operands match. for (unsigned i = 2; i != NumElems; ++i) if (Ops[i % 2] != Op.getOperand(i)) return false; return true; }; if (CanSplat(Op, NumElems, Ops)) { MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64; MVT NarrowVT = MVT::getVectorVT(EltVT, 4); // Create a new build vector and cast to v2i64/v2f64. SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2), DAG.getBuildVector(NarrowVT, dl, Ops)); // Broadcast from v2i64/v2f64 and cast to final VT. MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV)); } } // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.getSizeInBits() > 128) { MVT HVT = MVT::getVectorVT(EltVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl, VT.getSizeInBits() / 2); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. if (NumElems == 4 && NumZero > 0) { SmallVector Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1ULL << i)); if (isZero) Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros >> (i*2)) & 0x3) { default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. break; case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break; case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; } } bool Reverse1 = (NonZeros & 0x3) == 2; bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. if (Subtarget.hasSSE41()) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). SmallVector Ops(NumElems); for (unsigned i = 0; i < NumElems; ++i) { if (!Op.getOperand(i).isUndef()) Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else Ops[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 1 ==> X: // : unpcklps 2, 3 ==> Y: // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { // Generate scaled UNPCKL shuffle mask. SmallVector Mask; for(unsigned i = 0; i != Scale; ++i) Mask.push_back(i); for (unsigned i = 0; i != Scale; ++i) Mask.push_back(NumElems+i); Mask.append(NumElems - Mask.size(), SM_SentinelUndef); for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } return Ops[0]; } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. // TODO: Detect subvector broadcast here instead of DAG combine? static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); unsigned NumOperands = Op.getNumOperands(); unsigned NumZero = 0; unsigned NumNonZero = 0; unsigned NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. NonZeros |= 1 << i; ++NumNonZero; } } // If we have more than 2 non-zeros, build each half separately. if (NumNonZero > 2) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } // Otherwise, build it up through insert_subvectors. SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) : DAG.getUNDEF(ResVT); MVT SubVT = Op.getOperand(0).getSimpleValueType(); unsigned NumSubElems = SubVT.getVectorNumElements(); for (unsigned i = 0; i != NumOperands; ++i) { if ((NonZeros & (1 << i)) == 0) continue; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i), DAG.getIntPtrConstant(i * NumSubElems, dl)); } return Vec; } // Returns true if the given node is a type promotion (by concatenating i1 // zeros) of the result of a node that already zeros all upper bits of // k-register. // TODO: Merge this with LowerAVXCONCAT_VECTORS? static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); unsigned NumOperands = Op.getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. NonZeros |= (uint64_t)1 << i; ++NumNonZero; } } // If there are zero or one non-zeros we can handle this very simply. if (NumNonZero <= 1) { SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) : DAG.getUNDEF(ResVT); if (!NumNonZero) return Vec; unsigned Idx = countTrailingZeros(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, DAG.getIntPtrConstant(Idx * SubVecNumElts, dl)); } if (NumOperands > 2) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } assert(NumNonZero == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); unsigned NumElems = ResVT.getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != i) return false; } return true; } /// Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { int LaneSize = 128 / VT.getScalarSizeInBits(); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same /// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// suitable for use with existing 128-bit shuffles as entries from the second /// vector have been remapped to [LaneSize, 2*LaneSize). static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0); if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a shuffle mask is equivalent within each 128-bit lane. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask) { SmallVector RepeatedMask; return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } /// Test whether a shuffle mask is equivalent within each 256-bit lane. static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); } /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, SM_SentinelUndef); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0)); if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] == SM_SentinelZero) { if (!isUndefOrZero(RepeatedMask[i % LaneSize])) return false; RepeatedMask[i % LaneSize] = SM_SentinelZero; continue; } if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] == SM_SentinelUndef) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef Mask, ArrayRef ExpectedMask) { if (Mask.size() != ExpectedMask.size()) return false; int Size = Mask.size(); // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. auto *BV1 = dyn_cast(V1); auto *BV2 = dyn_cast(V2); for (int i = 0; i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) { auto *MaskBV = Mask[i] < Size ? BV1 : BV2; auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; if (!MaskBV || !ExpectedBV || MaskBV->getOperand(Mask[i] % Size) != ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } } return true; } /// Checks whether a target shuffle mask is equivalent to an explicit pattern. /// /// The masks must be exactly the same width. /// /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// /// SM_SentinelZero is accepted as a valid negative index but must match in both. static bool isTargetShuffleEquivalent(ArrayRef Mask, ArrayRef ExpectedMask) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && "Illegal target shuffle mask"); for (int i = 0; i < Size; ++i) if (Mask[i] == SM_SentinelUndef) continue; else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) return false; else if (Mask[i] != ExpectedMask[i]) return false; return true; } // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle // mask. static SmallVector createTargetShuffleMask(ArrayRef Mask, const APInt &Zeroable) { int NumElts = Mask.size(); assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); SmallVector TargetMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); } return TargetMask; } // Attempt to create a shuffle mask from a VSELECT condition mask. static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, SDValue Cond) { if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return false; unsigned Size = Cond.getValueType().getVectorNumElements(); Mask.resize(Size, SM_SentinelUndef); for (int i = 0; i != (int)Size; ++i) { SDValue CondElt = Cond.getOperand(i); Mask[i] = i; // Arbitrarily choose from the 2nd operand if the select condition element // is undef. // TODO: Can we do better by matching patterns such as even/odd? if (CondElt.isUndef() || isNullConstant(CondElt)) Mask[i] += Size; } return true; } // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT) { if (VT != MVT::v8i32 && VT != MVT::v8f32) return false; SmallVector Unpcklwd; createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, /* Unary = */ false); SmallVector Unpckhwd; createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, /* Unary = */ false); bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || isTargetShuffleEquivalent(Mask, Unpckhwd)); return IsUnpackwdMask; } static bool is128BitUnpackShuffleMask(ArrayRef Mask) { // Create 128-bit vector type based on mask size. MVT EltVT = MVT::getIntegerVT(128 / Mask.size()); MVT VT = MVT::getVectorVT(EltVT, Mask.size()); // We can't assume a canonical shuffle mask, so try the commuted version too. SmallVector CommutedMask(Mask.begin(), Mask.end()); ShuffleVectorSDNode::commuteMask(CommutedMask); // Match any of unary/binary or low/high. for (unsigned i = 0; i != 4; ++i) { SmallVector UnpackMask; createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); if (isTargetShuffleEquivalent(Mask, UnpackMask) || isTargetShuffleEquivalent(CommutedMask, UnpackMask)) return true; } return false; } /// Return true if a shuffle mask chooses elements identically in its top and /// bottom halves. For example, any splat mask has the same top and bottom /// halves. If an element is undefined in only one half of the mask, the halves /// are not considered identical. static bool hasIdenticalHalvesShuffleMask(ArrayRef Mask) { assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask"); unsigned HalfSize = Mask.size() / 2; for (unsigned i = 0; i != HalfSize; ++i) { if (Mask[i] != Mask[i + HalfSize]) return false; } return true; } /// Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static unsigned getV4X86ShuffleImm(ArrayRef Mask) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; return Imm; } static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } /// Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. /// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. static APInt computeZeroableShuffleElements(ArrayRef Mask, SDValue V1, SDValue V2) { APInt Zeroable(Mask.size(), 0); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); int VectorSizeInBits = V1.getValueSizeInBits(); int ScalarSizeInBits = VectorSizeInBits / Mask.size(); assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { Zeroable.setBit(i); continue; } // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; M %= Size; // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. if (V.getOpcode() != ISD::BUILD_VECTOR) continue; // If the BUILD_VECTOR has fewer elements then the bitcasted portion of // the (larger) source element must be UNDEF/ZERO. if ((Size % V.getNumOperands()) == 0) { int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef() || X86::isZeroNode(Op)) Zeroable.setBit(i); else if (ConstantSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getAPIntValue(); Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); if (Val == 0) Zeroable.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); if (Val == 0) Zeroable.setBit(i); } continue; } // If the BUILD_VECTOR has more elements then all the (smaller) source // elements must be UNDEF or ZERO. if ((V.getNumOperands() % Size) == 0) { int Scale = V->getNumOperands() / Size; bool AllZeroable = true; for (int j = 0; j < Scale; ++j) { SDValue Op = V.getOperand((M * Scale) + j); AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); } if (AllZeroable) Zeroable.setBit(i); continue; } } return Zeroable; } // The Shuffle result is as follow: // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. // Each Zeroable's element correspond to a particular Mask's element. // As described in computeZeroableShuffleElements function. // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element if (NextElement < 0) { NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } // Exit if the mask's non zero elements are not in increasing order. if (NextElement != Mask[i]) return false; NextElement++; } return true; } /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; const int NumEltBytes = VT.getScalarSizeInBits() / 8; assert((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())); SmallVector PSHUFBMask(NumBytes); // Sign bit set in i8 mask means zero element. SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); SDValue V; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / NumEltBytes]; if (M < 0) { PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); continue; } if (Zeroable[i / NumEltBytes]) { PSHUFBMask[i] = ZeroMask; continue; } // We can only use a single input of V1 or V2. SDValue SrcV = (M >= Size ? V2 : V1); if (V && V != SrcV) return SDValue(); V = SrcV; M %= Size; // PSHUFB can't cross lanes, ensure this doesn't happen. if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) return SDValue(); M = M % LaneSize; M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); unsigned NumElts = VT.getVectorNumElements(); assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && "Unexpected number of vector elements"); SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; for (int i = 0; i != NumElts; i += 2) { int M1 = TargetMask[i + 0]; int M2 = TargetMask[i + 1]; Undef1 &= (SM_SentinelUndef == M1); Undef2 &= (SM_SentinelUndef == M2); Zero1 &= isUndefOrZero(M1); Zero2 &= isUndefOrZero(M2); } assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && "Zeroable shuffle detected"); // Attempt to match the target mask against the unpack lo/hi mask patterns. SmallVector Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } // If an unary shuffle, attempt to match as an unpack lo/hi with zero. if (IsUnary && (Zero1 || Zero2)) { // Don't bother if we can blend instead. if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) return false; bool MatchLo = true, MatchHi = true; for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { int M = TargetMask[i]; // Ignore if the input is known to be zero or the index is undef. if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || (M == SM_SentinelUndef)) continue; MatchLo &= (M == Unpckl[i]); MatchHi &= (M == Unpckh[i]); } if (MatchLo || MatchHi) { UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; return true; } } // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; } } return false; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SmallVector Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); SmallVector Unpckh; createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } static bool matchVectorShuffleAsVPMOV(ArrayRef Mask, bool SwappedOps, int Delta) { int Size = (int)Mask.size(); int Split = Size / Delta; int TruncatedVectorStart = SwappedOps ? Size : 0; // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,... if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta)) return false; // The rest of the mask should not refer to the truncated vector's elements. if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart, TruncatedVectorStart + Size)) return false; return true; } // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction. // // An example is the following: // // t0: ch = EntryToken // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0 // t25: v4i32 = truncate t2 // t41: v8i16 = bitcast t25 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16, // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0> // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21 // t18: v2i64 = bitcast t51 // // Without avx512vl, this is lowered to: // // vpmovqd %zmm0, %ymm0 // vpshufb {{.*#+}} xmm0 = // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero // // But when avx512vl is available, one can just use a single vpmovdw // instruction. static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (VT != MVT::v16i8 && VT != MVT::v8i16) return SDValue(); if (Mask.size() != VT.getVectorNumElements()) return SDValue(); bool SwappedOps = false; if (!ISD::isBuildVectorAllZeros(V2.getNode())) { if (!ISD::isBuildVectorAllZeros(V1.getNode())) return SDValue(); std::swap(V1, V2); SwappedOps = true; } // Look for: // // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8> // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16> // // and similar ones. if (V1.getOpcode() != ISD::BITCAST) return SDValue(); if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE) return SDValue(); SDValue Src = V1.getOperand(0).getOperand(0); MVT SrcVT = Src.getSimpleValueType(); // The vptrunc** instructions truncating 128 bit and 256 bit vectors // are only available with avx512vl. if (!SrcVT.is512BitVector() && !Subtarget.hasVLX()) return SDValue(); // Down Convert Word to Byte is only available with avx512bw. The case with // 256-bit output doesn't contain a shuffle and is therefore not handled here. if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 && !Subtarget.hasBWI()) return SDValue(); // The first half/quarter of the mask should refer to every second/fourth // element of the vector truncated and bitcasted. if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) && !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4)) return SDValue(); return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src); } // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef TargetMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); MVT PackSVT = MVT::getIntegerVT(BitSize * 2); MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); auto MatchPACK = [&](SDValue N1, SDValue N2) { SDValue VV1 = DAG.getBitcast(PackVT, N1); SDValue VV2 = DAG.getBitcast(PackVT, N2); if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKUS; return true; } } if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKSS; return true; } return false; }; // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false); if (isTargetShuffleEquivalent(TargetMask, BinaryMask)) if (MatchPACK(V1, V2)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true); if (isTargetShuffleEquivalent(TargetMask, UnaryMask)) if (MatchPACK(V1, V1)) return true; return false; } static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, Subtarget)) return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), DAG.getBitcast(PackVT, V2)); return SDValue(); } /// Try to emit a bitmask instruction for a shuffle. /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT MaskVT = VT; MVT EltVT = VT.getVectorElementType(); SDValue Zero, AllOnes; // Use f64 if i64 isn't legal. if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { EltVT = MVT::f64; MaskVT = MVT::getVectorVT(EltVT, Mask.size()); } MVT LogicVT = VT; if (EltVT == MVT::f32 || EltVT == MVT::f64) { Zero = DAG.getConstantFP(0.0, DL, EltVT); AllOnes = DAG.getConstantFP( APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT); LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size()); } else { Zero = DAG.getConstant(0, DL, EltVT); AllOnes = DAG.getAllOnesConstant(DL, EltVT); } SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); VMask = DAG.getBitcast(LogicVT, VMask); V = DAG.getBitcast(LogicVT, V); SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); return DAG.getBitcast(VT, And); } /// Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); } SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG); static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, MutableArrayRef TargetMask, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { int M = TargetMask[i]; if (M == SM_SentinelUndef) continue; if (M == i) continue; if (M == i + Size) { BlendMask |= 1ull << i; continue; } if (M == SM_SentinelZero) { if (V1IsZeroOrUndef) { ForceV1Zero = true; TargetMask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; TargetMask[i] = i + Size; continue; } } return false; } return true; } static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { uint64_t ScaledMask = 0; for (int i = 0; i != Size; ++i) if (BlendMask & (1ull << i)) ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); return ScaledMask; } /// Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector Mask = createTargetShuffleMask(Original, Zeroable); uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. if (ForceV1Zero) V1 = getZeroVector(VT, Subtarget, DAG, DL); if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); switch (VT.SimpleTy) { case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); LLVM_FALLTHROUGH; case MVT::v4f64: case MVT::v8f32: assert(Subtarget.hasAVX() && "256-bit float blends require AVX!"); LLVM_FALLTHROUGH; case MVT::v2f64: case MVT::v2i64: case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v16i16: { assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } // Use PBLENDW for lower/upper lanes and then blend lanes. // TODO - we should allow 2 PBLENDW here and leave shuffle combine to // merge to VSELECT where useful. uint64_t LoMask = BlendMask & 0xFF; uint64_t HiMask = (BlendMask >> 8) & 0xFF; if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) { SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(LoMask, DL, MVT::i8)); SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(HiMask, DL, MVT::i8)); return DAG.getVectorShuffle( MVT::v16i16, DL, Lo, Hi, {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}); } LLVM_FALLTHROUGH; } case MVT::v32i8: assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!"); LLVM_FALLTHROUGH; case MVT::v16i8: { assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!"); // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; if (Subtarget.hasBWI() && Subtarget.hasVLX()) { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; // This form of blend is always done on bytes. Compute the byte vector // type. MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // x86 allows load folding with blendvb from the 2nd source operand. But // we are still using LLVM select here (see comment below), so that's V1. // If V2 can be load-folded and V1 cannot be load-folded, then commute to // allow that load-folding possibility. if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code // generator that boolean values in the elements of an x86 vector register // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' // mapping a select to operand #1, and 'false' mapping to operand #2. The // reality in x86 is that vector masks (pre-AVX-512) use only the high bit // of the element (the remaining are ignored) and 0 in that high bit would // mean operand #1 while 1 in the high bit would mean operand #2. So while // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. SmallVector VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8)); V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); } case MVT::v16f32: case MVT::v8f64: case MVT::v8i64: case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { // Attempt to lower to a bitmask if we can. Only if not optimizing for size. bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if (!OptForSize) { if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; } // Otherwise load an immediate into a GPR, cast to k-register, and use a // masked move. MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } default: llvm_unreachable("Not a supported integer vector type!"); } } /// Try to lower as a blend of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, bool ImmBlends = false) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector BlendMask(Mask.size(), -1); SmallVector PermuteMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); if (BlendMask[Mask[i] % Size] < 0) BlendMask[Mask[i] % Size] = Mask[i]; else if (BlendMask[Mask[i] % Size] != Mask[i]) return SDValue(); // Can't blend in the needed input! PermuteMask[i] = Mask[i] % Size; } // If only immediate blends, then bail if the blend mask can't be widened to // i16. unsigned EltSize = VT.getScalarSizeInBits(); if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask)) return SDValue(); SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } /// Try to lower as an unpack of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can unpack elements from two inputs and /// then reduce the shuffle to a single-input (wider) permutation. static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; int NumHalfLaneElts = NumLaneElts / 2; bool MatchLo = true, MatchHi = true; SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; // Determine UNPCKL/UNPCKH type and operand order. for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { for (int Elt = 0; Elt != NumLaneElts; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; SDValue &Op = Ops[Elt & 1]; if (M < NumElts && (Op.isUndef() || Op == V1)) Op = V1; else if (NumElts <= M && (Op.isUndef() || Op == V2)) Op = V2; else return SDValue(); int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts; MatchLo &= isUndefOrInRange(M, Lo, Mid) || isUndefOrInRange(M, NumElts + Lo, NumElts + Mid); MatchHi &= isUndefOrInRange(M, Mid, Hi) || isUndefOrInRange(M, NumElts + Mid, NumElts + Hi); if (!MatchLo && !MatchHi) return SDValue(); } } assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI"); // Now check that each pair of elts come from the same unpack pair // and set the permute mask based on each pair. // TODO - Investigate cases where we permute individual elements. SmallVector PermuteMask(NumElts, -1); for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { for (int Elt = 0; Elt != NumLaneElts; Elt += 2) { int M0 = Mask[Lane + Elt + 0]; int M1 = Mask[Lane + Elt + 1]; if (0 <= M0 && 0 <= M1 && (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts)) return SDValue(); if (0 <= M0) PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts)); if (0 <= M1) PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1; } } unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops); return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask); } /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. static SDValue lowerShuffleAsByteRotateAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || (VT.is256BitVector() && !Subtarget.hasAVX2()) || (VT.is512BitVector() && !Subtarget.hasBWI())) return SDValue(); // We don't currently support lane crossing permutes. if (is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); int Scale = VT.getScalarSizeInBits() / 8; int NumLanes = VT.getSizeInBits() / 128; int NumElts = VT.getVectorNumElements(); int NumEltsPerLane = NumElts / NumLanes; // Determine range of mask elts. bool Blend1 = true; bool Blend2 = true; std::pair Range1 = std::make_pair(INT_MAX, INT_MIN); std::pair Range2 = std::make_pair(INT_MAX, INT_MIN); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) { Blend1 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range1.first = std::min(Range1.first, M); Range1.second = std::max(Range1.second, M); } else { M -= NumElts; Blend2 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range2.first = std::min(Range2.first, M); Range2.second = std::max(Range2.second, M); } } } // Bail if we don't need both elements. // TODO - it might be worth doing this for unary shuffles if the permute // can be widened. if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) || !(0 <= Range2.first && Range2.second < NumEltsPerLane)) return SDValue(); if (VT.getSizeInBits() > 128 && (Blend1 || Blend2)) return SDValue(); // Rotate the 2 ops so we can access both ranges, then permute the result. auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) { MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); SDValue Rotate = DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), DAG.getBitcast(ByteVT, Lo), DAG.getConstant(Scale * RotAmt, DL, MVT::i8))); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane); else PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane); } } return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask); }; // Check if the ranges are small enough to rotate from either direction. if (Range2.second < Range1.first) return RotateAndPermute(V1, V2, Range1.first, 0); if (Range1.second < Range2.first) return RotateAndPermute(V2, V1, Range2.first, NumElts); return SDValue(); } /// Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. static SDValue lowerShuffleAsDecomposedShuffleBlend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector V1Mask(Mask.size(), -1); SmallVector V2Mask(Mask.size(), -1); SmallVector BlendMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && Mask[i] < Size) { V1Mask[i] = Mask[i]; BlendMask[i] = i; } else if (Mask[i] >= Size) { V2Mask[i] = Mask[i] - Size; BlendMask[i] = i + Size; } // Try to lower with the simpler initial blend/unpack/rotate strategies unless // one of the input shuffles would be a no-op. We prefer to shuffle inputs as // the shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, a 2-input // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { // Only prefer immediate blends to unpack/rotate. if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true)) return BlendPerm; if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) return UnpackPerm; if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; // Unpack/rotate failed - try again with variable blends. if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } /// Try to lower a vector shuffle as a rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && "Unexpected mask index."); if (M < 0) continue; // Determine where a rotated vector would have started. int StartIdx = i - (M % NumElts); if (StartIdx == 0) // The identity rotation isn't interesting, stop. return -1; // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return -1; // Compute which value this mask is pointing at. SDValue MaskV = M < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return -1; } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; V1 = Lo; V2 = Hi; return Rotation; } /// Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. /// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask) { // Don't accept any shuffles with zero elements. if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return -1; // PALIGNR works on 128-bit lanes. SmallVector RepeatedMask; if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; // PALIGNR rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int NumElts = RepeatedMask.size(); int Scale = 16 / NumElts; return Rotation * Scale; } static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); // Cast the inputs to i8 vector of correct length to match PALIGNR or // PSLLDQ/PSRLDQ. MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); Lo = DAG.getBitcast(ByteVT, Lo); Hi = DAG.getBitcast(ByteVT, Hi); // SSSE3 targets can use the palignr instruction. if (Subtarget.hasSSSE3()) { assert((!VT.is512BitVector() || Subtarget.hasBWI()) && "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, DAG.getConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, DAG.getConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } /// Try to lower a vector shuffle as a dword/qword rotation. /// /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary /// rotation of the concatenation of two vectors; This routine will /// try to generically lower a vector shuffle through such an pattern. /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); // 128/256-bit vectors are only supported with VLX. assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; int Rotation = matchShuffleAsRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, DAG.getConstant(Rotation, DL, MVT::i8)); } /// Try to lower a vector shuffle as a byte shift sequence. static SDValue lowerVectorShuffleAsByteShiftMask( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); assert(VT.is128BitVector() && "Only 128-bit vectors supported"); // We need a shuffle that has zeros at one/both ends and a sequential // shuffle from one source within. unsigned ZeroLo = Zeroable.countTrailingOnes(); unsigned ZeroHi = Zeroable.countLeadingOnes(); if (!ZeroLo && !ZeroHi) return SDValue(); unsigned NumElts = Mask.size(); unsigned Len = NumElts - (ZeroLo + ZeroHi); if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo])) return SDValue(); unsigned Scale = VT.getScalarSizeInBits() / 8; ArrayRef StubMask = Mask.slice(ZeroLo, Len); if (!isUndefOrInRange(StubMask, 0, NumElts) && !isUndefOrInRange(StubMask, NumElts, 2 * NumElts)) return SDValue(); SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2; Res = DAG.getBitcast(MVT::v16i8, Res); // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an // inner sequential set of elements, possibly offset: // 01234567 --> zzzzzz01 --> 1zzzzzzz // 01234567 --> 4567zzzz --> zzzzz456 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz if (ZeroLo == 0) { unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getConstant(Scale * ZeroHi, DL, MVT::i8)); } else if (ZeroHi == 0) { unsigned Shift = Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); } else if (!Subtarget.hasSSSE3()) { // If we don't have PSHUFB then its worth avoiding an AND constant mask // by performing 3 byte shifts. Shuffle combining can kick in above that. // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getConstant(Scale * Shift, DL, MVT::i8)); Shift += Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); } else return SDValue(); return DAG.getBitcast(VT, Res); } /// Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) return -1; } int ShiftEltBits = ScalarSizeInBits * Scale; bool ByteShift = ShiftEltBits > 64; Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); return (int)ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) { int ShiftAmt = MatchShift(Shift, Scale, Left); if (0 < ShiftAmt) return ShiftAmt; } // no match return -1; } static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); MVT ShiftVT; SDValue V = V1; unsigned Opcode; // Try to match shuffle against V1 shift. int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. if (ShiftAmt < 0) { ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, Size, Zeroable, Subtarget); V = V2; } if (ShiftAmt < 0) return SDValue(); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefUpperHalf(Mask)) return false; // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. if (i > M || M >= HalfSize) return false; if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return false; } if (!Src || Idx < 0) return false; assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Src; return true; } // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefUpperHalf(Mask)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Base; V2 = Insert; return true; } } return false; } /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); return SDValue(); } /// Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and /// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int EltBits = VT.getScalarSizeInBits(); int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = 128 / EltBits; int OffsetLane = Offset / NumEltsPerLane; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); assert(0 <= Offset && "Extension offset must be positive."); assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && "Extension offset must be in the first lane or start an upper lane."); // Check that an index is in same lane as the base offset. auto SafeOffset = [&](int Idx) { return OffsetLane == (Idx / NumEltsPerLane); }; // Shift along an input so that the offset base moves to the first element. auto ShuffleOffset = [&](SDValue V) { if (!Offset) return V; SmallVector ShMask((unsigned)NumElements, -1); for (int i = 0; i * Scale < NumElements; ++i) { int SrcIdx = i + Offset; ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; } return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. // TODO: Add AnyExt support. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {Offset / 2, -1, SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); assert(VT.is128BitVector() && "Unexpected vector width!"); int LoIdx = Offset * EltBits; SDValue Lo = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getBitcast(VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); PSHUFBMask[i] = DAG.getConstant( (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask))); } // If we are extending from an offset, ensure we start on a boundary that // we can unpack from. int AlignToUnpack = Offset % (NumElements / Scale); if (AlignToUnpack) { SmallVector ShMask((unsigned)NumElements, -1); for (int i = AlignToUnpack; i < NumElements; ++i) ShMask[i - AlignToUnpack] = i; InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); Offset -= AlignToUnpack; } // Otherwise emit a sequence of unpacks. do { unsigned UnpackLoHi = X86ISD::UNPCKL; if (Offset >= (NumElements / 2)) { UnpackLoHi = X86ISD::UNPCKH; Offset -= (NumElements / 2); } MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); return DAG.getBitcast(VT, InputV); } /// Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't /// check for the profitability of this lowering, it tries to aggressively /// match this pattern. It will use all of the micro-architectural details it /// can to emit an efficient lowering. It handles both blends with all-zero /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to /// masking out later). /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. static SDValue lowerShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; int Offset = 0; int Matches = 0; for (int i = 0; i < NumElements; ++i) { int M = Mask[i]; if (M < 0) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); // We no longer are in the anyext case. AnyExt = false; continue; } // Each of the base elements needs to be consecutive indices into the // same input vector. SDValue V = M < NumElements ? V1 : V2; M = M % NumElements; if (!InputV) { InputV = V; Offset = M - (i / Scale); } else if (InputV != V) return SDValue(); // Flip-flopping inputs. // Offset must start in the lowest 128-bit lane or at the start of an // upper lane. // FIXME: Is it ever worth allowing a negative base offset? if (!((0 <= Offset && Offset < NumEltsPerLane) || (Offset % NumEltsPerLane) == 0)) return SDValue(); // If we are offsetting, all referenced entries must come from the same // lane. if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) return SDValue(); if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. Matches++; } // If we fail to find an input, we have a zero-shuffle which should always // have already been handled. // FIXME: Maybe handle this here in case during blending we end up with one? if (!InputV) return SDValue(); // If we are offsetting, don't extend if we only match a single input, we // can always do better by using a basic PSHUF or PUNPCK. if (Offset != 0 && Matches < 2) return SDValue(); return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. assert(Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"); int NumExtElements = Bits / 64; // Each iteration, try extending the elements half as much, but into twice as // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } // General extends failed, but 128-bit vectors may be able to use MOVQ. if (Bits != 128) return SDValue(); // Returns one of the source operands if the shuffle can be reduced to a // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. auto CanZExtLowHalf = [&]() { for (int i = NumElements / 2; i != NumElements; ++i) if (!Zeroable[i]) return SDValue(); if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) return V1; if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) return V2; return SDValue(); }; if (SDValue V = CanZExtLowHalf()) { V = DAG.getBitcast(MVT::v2i64, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); return DAG.getBitcast(VT, V); } // No viable ext lowering found. return SDValue(); } /// Try to get a scalar value for a specific element of a vector. /// /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); V = peekThroughBitcasts(V); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getBitcast(EltVT, S); } return SDValue(); } /// Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { V = peekThroughBitcasts(V); return ISD::isNON_EXTLoad(V.getNode()); } /// Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. static SDValue lowerShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32); V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply, and // the V1 elements can't be permuted in any way. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); SmallVector V1Mask(Mask.begin(), Mask.end()); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); if (!VT.is128BitVector()) return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"); return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v16i8, V2, DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); EVT EltVT = VT.getVectorElementType(); EVT V0VT = V0.getValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); EVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. // Hopefully, we can fold away the trunc/srl/load into the broadcast. // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. if (const int OffsetIdx = BroadcastIdx % Scale) Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8)); return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } /// Test whether this can be lowered with a single SHUFPS instruction. /// /// This is used to disable more specialized lowerings when the shufps lowering /// will happen to be efficient. static bool isSingleSHUFPSMask(ArrayRef Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// If we are extracting two 128-bit halves of a vector and shuffling the /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a /// multi-shuffle lowering. static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef Mask, SelectionDAG &DAG) { EVT VT = N0.getValueType(); assert((VT.is128BitVector() && (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && "VPERM* family of shuffles requires 32-bit or 64-bit elements"); // Check that both sources are extracts of the same source vector. if (!N0.hasOneUse() || !N1.hasOneUse() || N0.getOpcode() != ISD::EXTRACT_SUBVECTOR || N1.getOpcode() != ISD::EXTRACT_SUBVECTOR || N0.getOperand(0) != N1.getOperand(0)) return SDValue(); SDValue WideVec = N0.getOperand(0); EVT WideVT = WideVec.getValueType(); if (!WideVT.is256BitVector() || !isa(N0.getOperand(1)) || !isa(N1.getOperand(1))) return SDValue(); // Match extracts of each half of the wide source vector. Commute the shuffle // if the extract of the low half is N1. unsigned NumElts = VT.getVectorNumElements(); SmallVector NewMask(Mask.begin(), Mask.end()); const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1); const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1); if (ExtIndex1 == 0 && ExtIndex0 == NumElts) ShuffleVectorSDNode::commuteMask(NewMask); else if (ExtIndex0 != 0 || ExtIndex1 != NumElts) return SDValue(); // Final bailout: if the mask is simple, we are better off using an extract // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps // because that avoids a constant load from memory. if (NumElts == 4 && (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask))) return SDValue(); // Extend the shuffle mask with undef elements. NewMask.append(NumElts, -1); // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), NewMask); // This is free: ymm -> xmm. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf, DAG.getIntPtrConstant(0, DL)); } /// Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || (Subtarget.hasAVX() && VT.isFloatingPoint()) || (Subtarget.hasAVX2() && VT.isInteger()))) return SDValue(); // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumElts = Mask.size(); unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); // Check that the mask is a broadcast. int BroadcastIdx = -1; for (int i = 0; i != (int)NumElts; ++i) { SmallVector BroadcastMask(NumElts, i); if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { BroadcastIdx = i; break; } } if (BroadcastIdx < 0) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { V = V.getOperand(0); continue; } case ISD::CONCAT_VECTORS: { int OpBitWidth = V.getOperand(0).getValueSizeInBits(); int OpIdx = BitOffset / OpBitWidth; V = V.getOperand(OpIdx); BitOffset %= OpBitWidth; continue; } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast(V.getOperand(2)); if (!ConstantIdx) break; int EltBitWidth = VOuter.getScalarValueSizeInBits(); int Idx = (int)ConstantIdx->getZExtValue(); int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements(); int BeginOffset = Idx * EltBitWidth; int EndOffset = BeginOffset + NumSubElts * EltBitWidth; if (BeginOffset <= BitOffset && BitOffset < EndOffset) { BitOffset -= BeginOffset; V = VInner; } else { V = VOuter; } continue; } } break; } assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset"); BroadcastIdx = BitOffset / NumEltBits; // Do we need to bitcast the source to retrieve the original broadcast index? bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits; // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. // If the original value has a larger element type than the shuffle, the // broadcast element is in essence truncated. Make that explicit to ease // folding. if (BitCastSrc && VT.isInteger()) if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast( DL, VT, V, BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; MVT BroadcastVT = VT; // Also check the simpler case, where we can directly reuse the scalar. if (!BitCastSrc && ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) { V = V.getOperand(BroadcastIdx); // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); } else if (MayFoldLoad(V) && !cast(V)->isVolatile()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : Opcode; } // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. LoadSDNode *Ld = cast(V); SDValue BaseAddr = Ld->getOperand(1); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); DAG.makeEquivalentMemoryOrdering(Ld, V); } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); } else if (BitOffset != 0) { // We can only broadcast from the zero-element of a vector register, // but it can be advantageous to broadcast from the zero-element of a // subvector. if (!VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); // Only broadcast the zero-element of a 128-bit subvector. if ((BitOffset % 128) != 0) return SDValue(); assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && "Unexpected bit-offset"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits(); V = extract128BitVector(V, ExtractIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, DAG.getBitcast(MVT::f64, V)); // Bitcast back to the same scalar type as BroadcastVT. if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) { assert(NumEltBits == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); MVT ExtVT; if (V.getValueType().isVector()) { unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); } else { ExtVT = BroadcastVT.getScalarType(); } V = DAG.getBitcast(ExtVT, V); } // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) { V = DAG.getBitcast(MVT::f64, V); unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); } // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. if (V.getValueSizeInBits() > 128) { MVT ExtVT = V.getSimpleValueType().getScalarType(); ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits()); V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); V = DAG.getBitcast(ExtVT, V); } return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Attempt to match INSERTPS with one element from VA or VB being // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask // are updated. auto matchAsInsertPS = [&](SDValue VA, SDValue VB, ArrayRef CandidateMask) { unsigned ZMask = 0; int VADstIndex = -1; int VBDstIndex = -1; bool VAUsedInPlace = false; for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any VA inputs in place. if (i == CandidateMask[i]) { VAUsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (VADstIndex >= 0 || VBDstIndex >= 0) return false; if (CandidateMask[i] < 4) { // VA input out of place for insertion. VADstIndex = i; } else { // VB input for insertion. VBDstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (VADstIndex < 0 && VBDstIndex < 0) return false; // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned VBSrcIndex = 0; if (VADstIndex >= 0) { // If we have a VA input out of place, we use VA as the V2 element // insertion and don't use the original V2 at all. VBSrcIndex = CandidateMask[VADstIndex]; VBDstIndex = VADstIndex; VB = VA; } else { VBSrcIndex = CandidateMask[VBDstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. if (!VAUsedInPlace) VA = DAG.getUNDEF(MVT::v4f32); // Update V1, V2 and InsertPSMask accordingly. V1 = VA; V2 = VB; // Insert the V2 element into the desired position. InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); return true; }; if (matchAsInsertPS(V1, V2, Mask)) return true; // Commute and try again. SmallVector CommutedMask(Mask.begin(), Mask.end()); ShuffleVectorSDNode::commuteMask(CommutedMask); if (matchAsInsertPS(V2, V1, CommutedMask)) return true; return false; } static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask; if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } /// Try to lower a shuffle as a permute of the inputs followed by an /// UNPCK instruction. /// /// This specifically targets cases where we end up with alternating between /// the two inputs, and so can permute them into something that feeds a single /// UNPCK instruction. Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerShuffleAsPermuteAndUnpack( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(VT.is128BitVector() && "This routine only works on 128-bit vectors."); assert(!V2.isUndef() && "This routine should only be used when blending two inputs."); assert(Mask.size() >= 2 && "Single element masks are invalid."); int Size = Mask.size(); int NumLoInputs = count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](int ScalarSize, int Scale) { SmallVector V1Mask((unsigned)Size, -1); SmallVector V2Mask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; } // If we will have to shuffle both inputs to use the unpack, check whether // we can just unpack first and shuffle the result. If so, skip this unpack. if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) return SDValue(); // Shuffle the inputs into place. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. return DAG.getBitcast( VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one // that fits this mask. int OrigScalarSize = VT.getScalarSizeInBits(); for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) return Unpack; // If we're shuffling with a zero vector then we're better off not doing // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements. if (ISD::isBuildVectorAllZeros(V1.getNode()) || ISD::isBuildVectorAllZeros(V2.getNode())) return SDValue(); // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. if (NumLoInputs == 0 || NumHiInputs == 0) { assert((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"); int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; // FIXME: We could consider the total complexity of the permute of each // possible unpacking. Or at the least we should consider how many // half-crossings are created. // FIXME: We could consider commuting the unpacks. SmallVector PermMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); } return DAG.getVectorShuffle( VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, V1, V2), DAG.getUNDEF(VT), PermMask); } return SDValue(); } /// Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode( X86ISD::SHUFP, DL, MVT::v2f64, Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. return DAG.getNode( X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); } assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getBitcast(MVT::v2f64, V1); V2 = DAG.getBitcast(MVT::v2f64, V2); return DAG.getBitcast(MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] < 0) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget.hasSSE3()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid // in SSE1 because otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } // Use low/high mov instructions. These are only valid in SSE1 because // otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions // but we aren't actually going to use the UNPCK instruction because doing // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) Mask = UnpackLoMask; else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (!isSingleSHUFPSMask(Mask)) { // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Unpack; } // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); return DAG.getBitcast(MVT::v4i32, ShufPS); } /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. /// /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); // Attempt to directly match PSHUFLW or PSHUFHW. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); } if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { for (int i = 0; i != 4; ++i) HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4)); return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); } SmallVector LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); // If we are shuffling values from one half - check how many different DWORD // pairs we need to create. If only 1 or 2 then we can perform this as a // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below. auto ShuffleDWordPairs = [&](ArrayRef PSHUFHalfMask, ArrayRef PSHUFDMask, unsigned ShufWOp) { V = DAG.getNode(ShufWOp, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); V = DAG.getBitcast(PSHUFDVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); return DAG.getBitcast(VT, V); }; if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { int PSHUFDMask[4] = { -1, -1, -1, -1 }; SmallVector, 4> DWordPairs; int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); // Collect the different DWORD pairs. for (int DWord = 0; DWord != 4; ++DWord) { int M0 = Mask[2 * DWord + 0]; int M1 = Mask[2 * DWord + 1]; M0 = (M0 >= 0 ? M0 % 4 : M0); M1 = (M1 >= 0 ? M1 % 4 : M1); if (M0 < 0 && M1 < 0) continue; bool Match = false; for (int j = 0, e = DWordPairs.size(); j < e; ++j) { auto &DWordPair = DWordPairs[j]; if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) && (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) { DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first); DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second); PSHUFDMask[DWord] = DOffset + j; Match = true; break; } } if (!Match) { PSHUFDMask[DWord] = DOffset + DWordPairs.size(); DWordPairs.push_back(std::make_pair(M0, M1)); } } if (DWordPairs.size() <= 2) { DWordPairs.resize(2, std::make_pair(-1, -1)); int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second, DWordPairs[1].first, DWordPairs[1].second}; if ((NumHToL + NumHToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW); if ((NumLToL + NumLToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW); } } // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord = 0, BDWord = 0; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); int NumFlippedBToBInputs = std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode( FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M >= 0 && M == FixIdx) M = FixFreeIdx; else if (M >= 0 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M >= 0 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M >= 0 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, MutableArrayRef FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] < 0) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] < 0) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. static SDValue lowerShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { assert(!is128BitLaneCrossingShuffleMask(VT, Mask) && "Lane crossing shuffle masks not supported"); int NumBytes = VT.getSizeInBits() / 8; int Size = Mask.size(); int Scale = NumBytes / Size; SmallVector V1Mask(NumBytes, DAG.getUNDEF(MVT::i8)); SmallVector V2Mask(NumBytes, DAG.getUNDEF(MVT::i8)); V1InUse = false; V2InUse = false; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Scale]; if (M < 0) continue; const int ZeroMask = 0x80; int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask; int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale; if (Zeroable[i / Scale]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes); if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1), DAG.getBuildVector(ShufVT, DL, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2), DAG.getBuildVector(ShufVT, DL, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; if (V1InUse && V2InUse) V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2); else V = V1InUse ? V1 : V2; // Cast the result back to the correct type. return DAG.getBitcast(VT, V); } /// Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. SmallVector MutableMask(Mask.begin(), Mask.end()); return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask, Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input " "shuffles."); // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue BitBlend = lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; // Try to use byte shift instructions to mask. if (SDValue V = lowerVectorShuffleAsByteShiftMask( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG); } /// Check whether a compaction lowering can be done by dropping even /// elements and compute how many times even elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even elements must be dropped if /// there is such a number. Otherwise returns zero. static int canLowerByDroppingEvenElements(ArrayRef Mask, bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] < 0) continue; bool IsAnyViable = false; for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); } /// Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use a zext lowering. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. // // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { for (int i = 0; i < 16; i += 2) if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) return false; return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector LoInputs; copy_if(Mask, std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] >= 0) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); if (PostDupI16Shuffle[i / 2] < 0) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Try to use byte shift instructions to mask. if (SDValue V = lowerVectorShuffleAsByteShiftMask( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input // lowerings can find an instruction sequence that is faster than a PSHUFB, we // want to preserve that and we can DAG combine any longer sequences into // a PSHUFB in the end. But once we start blending from multiple inputs, // the complexity of DAG combining bad patterns back into PSHUFB is too high, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget.hasSSSE3()) { bool V1InUse = false; bool V2InUse = false; SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some // cases. Even though the or may be (very minorly) more efficient, we // preference this lowering because there are common cases where part of // the complexity of the shuffles goes away when we do the final blend as // an unpack. // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. if (SDValue V = lowerShuffleAsByteRotateAndPermute( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; } return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) return Blend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. // // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. bool IsSingleInput = V2.isUndef(); if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); // We use the mask type to pick which bytes are preserved based on how many // elements are dropped. MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; SDValue ByteClearMask = DAG.getBitcast( MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); // Now pack things back together. V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } return Result; } // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together // with a pack. SDValue V = V1; std::array LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; std::array HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; SDValue VLoHalf, VHiHalf; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) && none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, DAG.getConstant(0x00FF, DL, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke VHiHalf. VHiHalf = DAG.getUNDEF(MVT::v8i16); // Squash the masks to point directly into VLoHalf. for (int &M : LoBlendMask) if (M >= 0) M /= 2; for (int &M : HiBlendMask) if (M >= 0) M /= 2; } else { // Otherwise just unpack the low half of V into VLoHalf and the high half into // VHiHalf so that we can blend them as i16s. SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL); VLoHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); VHiHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } /// Dispatching routine to lower various 128-bit x86 vector shuffles. /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } /// Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); assert(V2.getSimpleValueType() == VT && "Bad operand type!"); ArrayRef LoMask = Mask.slice(0, Mask.size() / 2); ArrayRef HiMask = Mask.slice(Mask.size() / 2); int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build // vectors. This helps shuffling with splats and zeros. auto SplitVector = [&](SDValue V) { V = peekThroughBitcasts(V); MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; auto *BV = dyn_cast(V); if (!BV) { LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(0, DL)); HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(OrigSplitNumElements, DL)); } else { SmallVector LoOps, HiOps; for (int i = 0; i < OrigSplitNumElements; ++i) { LoOps.push_back(BV->getOperand(i)); HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); } LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); } return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); }; SDValue LoV1, HiV1, LoV2, HiV2; std::tie(LoV1, HiV1) = SplitVector(V1); std::tie(LoV2, HiV2) = SplitVector(V2); // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef HalfMask) { bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; SmallVector V1BlendMask((unsigned)SplitNumElements, -1); SmallVector V2BlendMask((unsigned)SplitNumElements, -1); SmallVector BlendMask((unsigned)SplitNumElements, -1); for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { if (M >= NumElements + SplitNumElements) UseHiV2 = true; else UseLoV2 = true; V2BlendMask[i] = M - NumElements; BlendMask[i] = SplitNumElements + i; } else if (M >= 0) { if (M >= SplitNumElements) UseHiV1 = true; else UseLoV1 = true; V1BlendMask[i] = M; BlendMask[i] = i; } } // Because the lowering happens after all combining takes place, we need to // manually combine these blend masks as much as possible so that we create // a minimal number of high-level vector shuffle nodes. // First try just blending the halves of V1 or V2. if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) return DAG.getUNDEF(SplitVT); if (!UseLoV2 && !UseHiV2) return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); if (!UseLoV1 && !UseHiV1) return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); SDValue V1Blend, V2Blend; if (UseLoV1 && UseHiV1) { V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); } else { // We only use half of V1 so map the usage down into the final blend mask. V1Blend = UseLoV1 ? LoV1 : HiV1; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); } if (UseLoV2 && UseHiV2) { V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); } else { // We only use half of V2 so map the usage down into the final blend mask. V2Blend = UseLoV2 ? LoV2 : HiV2; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= SplitNumElements) BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); } return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); }; SDValue Lo = HalfBlend(LoMask); SDValue Hi = HalfBlend(HiMask); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } /// Either split a vector in halves or decompose the shuffles and the /// blend. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, // prefer that lowering. This is especially important because broadcasts can // often fold with memory operands. auto DoBothBroadcast = [&] { int V1BroadcastIdx = -1, V2BroadcastIdx = -1; for (int M : Mask) if (M >= Size) { if (V2BroadcastIdx < 0) V2BroadcastIdx = M - Size; else if (M - Size != V2BroadcastIdx) return false; } else if (M >= 0) { if (V1BroadcastIdx < 0) V1BroadcastIdx = M; else if (M != V1BroadcastIdx) return false; } return true; }; if (DoBothBroadcast()) return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to // unusually few instructions. int LaneCount = VT.getSizeInBits() / 128; int LaneSize = Size / LaneCount; SmallBitVector LaneInputs[2]; LaneInputs[0].resize(LaneCount, false); LaneInputs[1].resize(LaneCount, false); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, DAG); } /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a lane permutation followed by a per-lane permutation. /// /// This is mainly for cases where we can have non-repeating permutes /// in each lane. /// /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask, /// we should investigate merging them. static SDValue lowerShuffleAsLanePermuteAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; SmallVector SrcLaneMask(NumLanes, SM_SentinelUndef); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; if (M < 0) continue; // Ensure that each lane comes from a single source lane. int SrcLane = M / NumEltsPerLane; int DstLane = i / NumEltsPerLane; if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane)) return SDValue(); SrcLaneMask[DstLane] = SrcLane; PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane); } // Make sure we set all elements of the lane mask, to avoid undef propagation. SmallVector LaneMask(NumElts, SM_SentinelUndef); for (int DstLane = 0; DstLane != NumLanes; ++DstLane) { int SrcLane = SrcLaneMask[DstLane]; if (0 <= SrcLane) for (int j = 0; j != NumEltsPerLane; ++j) { LaneMask[(DstLane * NumEltsPerLane) + j] = (SrcLane * NumEltsPerLane) + j; } } // If we're only shuffling a single lowest lane and the rest are identity // then don't bother. // TODO - isShuffleMaskInputInPlace could be extended to something like this. int NumIdentityLanes = 0; bool OnlyShuffleLowestLane = true; for (int i = 0; i != NumLanes; ++i) { if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane, i * NumEltsPerLane)) NumIdentityLanes++; else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes) OnlyShuffleLowestLane = false; } if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1)) return SDValue(); SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask); return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask); } /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a permutation and blend of those lanes. /// /// This essentially blends the out-of-lane inputs to each lane into the lane /// from a permuted copy of the vector. This lowering strategy results in four /// instructions in the worst case for a single-input cross lane shuffle which /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. static SDValue lowerShuffleAsLanePermuteAndBlend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); int LaneSize = Size / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. if (!Subtarget.hasAVX2()) { bool LaneCrossing[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] / LaneSize)] = true; if (!LaneUsed[0] || !LaneUsed[1]) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); SmallVector FlippedBlendMask(Size); for (int i = 0; i < Size; ++i) FlippedBlendMask[i] = Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) ? Mask[i] : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size); // Flip the vector, and blend the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), { 2, 3, 0, 1 }); Flipped = DAG.getBitcast(VT, Flipped); return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } /// Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask)) return SDValue(); bool IsLowZero = (Zeroable & 0x3) == 0x3; bool IsHighZero = (Zeroable & 0xc) == 0xc; // Try to use an insert into a zero vector. if (WidenedMask[0] == 0 && IsHighZero) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, DAG.getIntPtrConstant(0, DL)); } // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. if (!IsLowZero && !IsHighZero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. if (!isa(peekThroughBitcasts(V1))) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(2, DL)); } } // Try to use SHUF128 if possible. if (Subtarget.hasVLX()) { if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } } } // Otherwise form a 128-bit permutation. After accounting for undefs, // convert the 64-bit shuffle mask selection values into 128-bit // selection bits by dividing the indexes by 2 and shifting into positions // defined by a vperm2*128 instruction's immediate control byte. // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination // [2] - ignore // [3] - zero low half of destination // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination assert((WidenedMask[0] >= 0 || IsLowZero) && (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?"); unsigned PermMask = 0; PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // Check the immediate mask and replace unused sources with undef. if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00) V1 = DAG.getUNDEF(VT); if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20) V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } /// Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// /// This attempts to create a repeated lane shuffle where each lane uses one /// or two of the lanes of the inputs. The lanes of the input vectors are /// shuffled in one or two independent shuffles to get the lanes into the /// position needed by the final shuffle. static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This is only useful with multiple inputs."); if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); int Size = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int LaneSize = 128 / VT.getScalarSizeInBits(); SmallVector RepeatMask(LaneSize, -1); SmallVector, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. for (int Lane = 0; Lane != NumLanes; ++Lane) { int Srcs[2] = { -1, -1 }; SmallVector InLaneMask(LaneSize, -1); for (int i = 0; i != LaneSize; ++i) { int M = Mask[(Lane * LaneSize) + i]; if (M < 0) continue; // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. int LaneSrc = M / LaneSize; int Src; if (Srcs[0] < 0 || Srcs[0] == LaneSrc) Src = 0; else if (Srcs[1] < 0 || Srcs[1] == LaneSrc) Src = 1; else return SDValue(); Srcs[Src] = LaneSrc; InLaneMask[i] = (M % LaneSize) + Src * Size; } // If this lane has two sources, see if it fits with the repeat mask so far. if (Srcs[1] < 0) continue; LaneSrcs[Lane][0] = Srcs[0]; LaneSrcs[Lane][1] = Srcs[1]; auto MatchMasks = [](ArrayRef M1, ArrayRef M2) { assert(M1.size() == M2.size() && "Unexpected mask size"); for (int i = 0, e = M1.size(); i != e; ++i) if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i]) return false; return true; }; auto MergeMasks = [](ArrayRef Mask, MutableArrayRef MergedMask) { assert(Mask.size() == MergedMask.size() && "Unexpected mask size"); for (int i = 0, e = MergedMask.size(); i != e; ++i) { int M = Mask[i]; if (M < 0) continue; assert((MergedMask[i] < 0 || MergedMask[i] == M) && "Unexpected mask element"); MergedMask[i] = M; } }; if (MatchMasks(InLaneMask, RepeatMask)) { // Merge this lane mask into the final repeat mask. MergeMasks(InLaneMask, RepeatMask); continue; } // Didn't find a match. Swap the operands and try again. std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]); ShuffleVectorSDNode::commuteMask(InLaneMask); if (MatchMasks(InLaneMask, RepeatMask)) { // Merge this lane mask into the final repeat mask. MergeMasks(InLaneMask, RepeatMask); continue; } // Couldn't find a match with the operands in either order. return SDValue(); } // Now handle any lanes with only one source. for (int Lane = 0; Lane != NumLanes; ++Lane) { // If this lane has already been processed, skip it. if (LaneSrcs[Lane][0] >= 0) continue; for (int i = 0; i != LaneSize; ++i) { int M = Mask[(Lane * LaneSize) + i]; if (M < 0) continue; // If RepeatMask isn't defined yet we can define it ourself. if (RepeatMask[i] < 0) RepeatMask[i] = M % LaneSize; if (RepeatMask[i] < Size) { if (RepeatMask[i] != M % LaneSize) return SDValue(); LaneSrcs[Lane][0] = M / LaneSize; } else { if (RepeatMask[i] != ((M % LaneSize) + Size)) return SDValue(); LaneSrcs[Lane][1] = M / LaneSize; } } if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0) return SDValue(); } SmallVector NewMask(Size, -1); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][0]; for (int i = 0; i != LaneSize; ++i) { int M = -1; if (Src >= 0) M = Src * LaneSize + i; NewMask[Lane * LaneSize + i] = M; } } SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); // Ensure we didn't get back the shuffle we started with. // FIXME: This is a hack to make up for some splat handling code in // getVectorShuffle. if (isa(NewV1) && cast(NewV1)->getMask() == Mask) return SDValue(); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][1]; for (int i = 0; i != LaneSize; ++i) { int M = -1; if (Src >= 0) M = Src * LaneSize + i; NewMask[Lane * LaneSize + i] = M; } } SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); // Ensure we didn't get back the shuffle we started with. // FIXME: This is a hack to make up for some splat handling code in // getVectorShuffle. if (isa(NewV2) && cast(NewV2)->getMask() == Mask) return SDValue(); for (int i = 0; i != Size; ++i) { NewMask[i] = RepeatMask[i % LaneSize]; if (NewMask[i] < 0) continue; NewMask[i] += (i / LaneSize) * LaneSize; } return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } /// If the input shuffle mask results in a vector that is undefined in all upper /// or lower half elements and that mask accesses only 2 halves of the /// shuffle's operands, return true. A mask of half the width with mask indexes /// adjusted to access the extracted halves of the original shuffle operands is /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or /// lower half of each input operand is accessed. static bool getHalfShuffleMask(ArrayRef Mask, MutableArrayRef HalfMask, int &HalfIdx1, int &HalfIdx2) { assert((Mask.size() == HalfMask.size() * 2) && "Expected input mask to be twice as long as output"); // Exactly one half of the result must be undef to allow narrowing. bool UndefLower = isUndefLowerHalf(Mask); bool UndefUpper = isUndefUpperHalf(Mask); if (UndefLower == UndefUpper) return false; unsigned HalfNumElts = HalfMask.size(); unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0; HalfIdx1 = -1; HalfIdx2 = -1; for (unsigned i = 0; i != HalfNumElts; ++i) { int M = Mask[i + MaskIndexOffset]; if (M < 0) { HalfMask[i] = M; continue; } // Determine which of the 4 half vectors this element is from. // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. int HalfIdx = M / HalfNumElts; // Determine the element index into its half vector source. int HalfElt = M % HalfNumElts; // We can shuffle with up to 2 half vectors, set the new 'half' // shuffle mask accordingly. if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { HalfMask[i] = HalfElt; HalfIdx1 = HalfIdx; continue; } if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { HalfMask[i] = HalfElt + HalfNumElts; HalfIdx2 = HalfIdx; continue; } // Too many half vectors referenced. return false; } return true; } /// Given the output values from getHalfShuffleMask(), create a half width /// shuffle of extracted vectors followed by an insert back to full width. static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG) { assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); assert(V1.getValueType().isSimple() && "Expecting only simple types"); MVT VT = V1.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); auto getHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? V1 : V2); HalfIdx = (HalfIdx % 2) * HalfNumElts; return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, DAG.getIntPtrConstant(HalfIdx, DL)); }; // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset SDValue Half1 = getHalfVector(HalfIdx1); SDValue Half2 = getHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); unsigned Offset = UndefLower ? HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. /// This allows for fast cases such as subvector extraction/insertion /// or shuffling smaller vector types which can lower more efficiently. static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.is256BitVector() || VT.is512BitVector()) && "Expected 256-bit or 512-bit vector"); bool UndefLower = isUndefLowerHalf(Mask); if (!UndefLower && !isUndefUpperHalf(Mask)) return SDValue(); assert((!UndefLower || !isUndefUpperHalf(Mask)) && "Completely undef shuffle mask should have been simplified already"); // Upper half is undef and lower half is whole upper subvector. // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> unsigned NumElts = VT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); if (!UndefLower && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(HalfNumElts, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(0, DL)); } // Lower half is undef and upper half is whole lower subvector. // e.g. vector_shuffle or if (UndefLower && isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(HalfNumElts, DL)); } int HalfIdx1, HalfIdx2; SmallVector HalfMask(HalfNumElts); if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2)) return SDValue(); assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); // Only shuffle the halves of the inputs when useful. unsigned NumLowerHalves = (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); unsigned NumUpperHalves = (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed"); // Determine the larger pattern of undef/halves, then decide if it's worth // splitting the shuffle based on subtarget capabilities and types. unsigned EltWidth = VT.getVectorElementType().getSizeInBits(); if (!UndefLower) { // XXXXuuuu: no insert is needed. // Always extract lowers when setting lower - these are all free subreg ops. if (NumUpperHalves == 0) return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, UndefLower, DAG); if (NumUpperHalves == 1) { // AVX2 has efficient 32/64-bit element cross-lane shuffles. if (Subtarget.hasAVX2()) { // extract128 + vunpckhps/vshufps, is better than vblend + vpermps. if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() && !is128BitUnpackShuffleMask(HalfMask) && (!isSingleSHUFPSMask(HalfMask) || Subtarget.hasFastVariableShuffle())) return SDValue(); // If this is a unary shuffle (assume that the 2nd operand is // canonicalized to undef), then we can use vpermpd. Otherwise, we // are better off extracting the upper half of 1 operand and using a // narrow shuffle. if (EltWidth == 64 && V2.isUndef()) return SDValue(); } // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. if (Subtarget.hasAVX512() && VT.is512BitVector()) return SDValue(); // Extract + narrow shuffle is better than the wide alternative. return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, UndefLower, DAG); } // Don't extract both uppers, instead shuffle and then extract. assert(NumUpperHalves == 2 && "Half vector count went wrong"); return SDValue(); } // UndefLower - uuuuXXXX: an insert to high half is required if we split this. if (NumUpperHalves == 0) { // AVX2 has efficient 64-bit element cross-lane shuffles. // TODO: Refine to account for unary shuffle, splat, and other masks? if (Subtarget.hasAVX2() && EltWidth == 64) return SDValue(); // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. if (Subtarget.hasAVX512() && VT.is512BitVector()) return SDValue(); // Narrow shuffle + insert is better than the wide alternative. return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, UndefLower, DAG); } // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert. return SDValue(); } /// Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } /// Handle case where shuffle sources are coming from the same 128-bit lane and /// every lane can be represented as the same repeating mask - allowing us to /// shuffle the sources with the repeating shuffle and then permute the result /// to the destination lanes. static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; // On AVX2 we may be able to just shuffle the lowest elements and then // broadcast the result. if (Subtarget.hasAVX2()) { for (unsigned BroadcastSize : {16, 32, 64}) { if (BroadcastSize <= VT.getScalarSizeInBits()) continue; int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); // Attempt to match a repeating pattern every NumBroadcastElts, // accounting for UNDEFs but only references the lowest 128-bit // lane of the inputs. auto FindRepeatingBroadcastMask = [&](SmallVectorImpl &RepeatMask) { for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) { int M = Mask[i + j]; if (M < 0) continue; int &R = RepeatMask[j]; if (0 != ((M % NumElts) / NumLaneElts)) return false; if (0 <= R && R != M) return false; R = M; } return true; }; SmallVector RepeatMask((unsigned)NumElts, -1); if (!FindRepeatingBroadcastMask(RepeatMask)) continue; // Shuffle the (lowest) repeated elements in place for broadcast. SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); // Shuffle the actual broadcast. SmallVector BroadcastMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) BroadcastMask[i + j] = j; return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), BroadcastMask); } } // Bail if the shuffle mask doesn't cross 128-bit lanes. if (!is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); // Bail if we already have a repeated lane shuffle mask. SmallVector RepeatedShuffleMask; if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) return SDValue(); // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; int NumSubLanes = NumLanes * SubLaneScale; int NumSubLaneElts = NumLaneElts / SubLaneScale; // Check that all the sources are coming from the same lane and see if we can // form a repeating shuffle mask (local to each sub-lane). At the same time, // determine the source sub-lane for each destination sub-lane. int TopSrcSubLane = -1; SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1); SmallVector RepeatedSubLaneMasks[2] = { SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef), SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef)}; for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { // Extract the sub-lane mask, check that it all comes from the same lane // and normalize the mask entries to come from the first lane. int SrcLane = -1; SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1); for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; if (M < 0) continue; int Lane = (M % NumElts) / NumLaneElts; if ((0 <= SrcLane) && (SrcLane != Lane)) return SDValue(); SrcLane = Lane; int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); SubLaneMask[Elt] = LocalM; } // Whole sub-lane is UNDEF. if (SrcLane < 0) continue; // Attempt to match against the candidate repeated sub-lane masks. for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) { for (int i = 0; i != NumSubLaneElts; ++i) { if (M1[i] < 0 || M2[i] < 0) continue; if (M1[i] != M2[i]) return false; } return true; }; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) continue; // Merge the sub-lane mask into the matching repeated sub-lane mask. for (int i = 0; i != NumSubLaneElts; ++i) { int M = SubLaneMask[i]; if (M < 0) continue; assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && "Unexpected mask element"); RepeatedSubLaneMask[i] = M; } // Track the top most source sub-lane - by setting the remaining to UNDEF // we can greatly simplify shuffle matching. int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); Dst2SrcSubLanes[DstSubLane] = SrcSubLane; break; } // Bail if we failed to find a matching repeated sub-lane mask. if (Dst2SrcSubLanes[DstSubLane] < 0) return SDValue(); } assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && "Unexpected source lane"); // Create a repeating shuffle mask for the entire vector. SmallVector RepeatedMask((unsigned)NumElts, -1); for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { int Lane = SubLane / SubLaneScale; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = RepeatedSubLaneMask[Elt]; if (M < 0) continue; int Idx = (SubLane * NumSubLaneElts) + Elt; RepeatedMask[Idx] = M + (Lane * NumLaneElts); } } SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); // Shuffle each source sub-lane to its destination. SmallVector SubLaneMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumSubLaneElts) { int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; if (SrcSubLane < 0) continue; for (int j = 0; j != NumSubLaneElts; ++j) SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); } return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), SubLaneMask); } static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, unsigned &ShuffleImm, ArrayRef Mask) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. ShuffleImm = 0; bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] < 0) return false; int Val = (i & 6) + NumElts * (i & 1); int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); if (Mask[i] < Val || Mask[i] > Val + 1) ShufpdMask = false; if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) CommutableMask = false; ShuffleImm |= (Mask[i] % 2) << i; } if (ShufpdMask) return true; if (CommutableMask) { std::swap(V1, V2); return true; } return false; } static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, DAG.getConstant(Immediate, DL, MVT::i8)); } /// Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise, fall back. return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on both lanes. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector PSHUFDMask; scaleShuffleMask(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getBitcast(MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } // AVX2 provides a direct instruction for permuting a single input across // lanes. return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use PALIGNR. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!isShuffleMaskInputInPlace(0, Mask) && !isShuffleMaskInputInPlace(1, Mask)) if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return V; // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code than vblend by using // vpunpcklwd and vpunpckhwd instrs. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the two 128-bit // lanes. SmallVector RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return V; if (V2.isUndef()) { // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget); } SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512BWVL can lower to VPERMW. if (Subtarget.hasBWI() && Subtarget.hasVLX()) return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return V; // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget); } if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512VBMIVL can lower to VPERMB. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); } /// High-level routine to lower various 256-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we // can check for those subtargets here and avoid much of the subtarget // querying in the per-vector-type lowering routines. With AVX1 we have // essentially *zero* ability to manipulate a 256-bit vector with integer // types. Since we'll use floating point types there eventually, just // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget.hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) { // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); V1 = DAG.getBitcast(FpVT, V1); V2 = DAG.getBitcast(FpVT, V2); return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); } } /// Try to lower a vector shuffle as a 128-bit shuffles. static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); // To handle 256 bit vector requires VLX and most probably // function lowerV2X128VectorShuffle() is better solution. assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); // TODO - use Zeroable like we do for lowerV2X128VectorShuffle? SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // Try to use an insert into a zero vector. if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 && (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) { unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4; MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, DAG.getIntPtrConstant(0, DL)); } // Check for patterns which can be matched with a single insert of a 256-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(4, DL)); } assert(WidenedMask.size() == 4); // See if this is an insertion of the lower 128-bits of V2 into V1. bool IsInsert = true; int V2Index = -1; for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; // Make sure all V1 subvectors are in place. if (WidenedMask[i] < 4) { if (WidenedMask[i] != i) { IsInsert = false; break; } } else { // Make sure we only have a single V2 index and its the lowest 128-bits. if (V2Index >= 0 || WidenedMask[i] != 4) { IsInsert = false; break; } V2Index = i; } } if (IsInsert && V2Index >= 0) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, DAG.getIntPtrConstant(0, DL)); return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); } // Try to lower to vshuf64x2/vshuf32x4. SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; unsigned PermMask = 0; // Insure elements came from the same Op. for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; SDValue Op = WidenedMask[i] >= 4 ? V2 : V1; unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op; else if (Ops[OpIndex] != Op) return SDValue(); // Convert the 128-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. PermMask |= (WidenedMask[i] % 4) << (i * 2); } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], DAG.getConstant(PermMask, DL, MVT::i8)); } /// Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) | ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } SmallVector RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2, Subtarget, DAG)) return Shuf128; if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Otherwise, fall back to a SHUFPS sequence. return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four // 128-bit lanes. SmallVector Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector PSHUFDMask; scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, DAG.getBitcast(MVT::v16i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } SmallVector Repeated256Mask; if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1, V2, Subtarget, DAG)) return Shuf128; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use PALIGNR. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the four 128-bit // lanes. SmallVector RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use byte rotation instructions. if (Subtarget.hasBWI()) if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (V2.isUndef()) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (!V2.isUndef()) if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Result; // FIXME: Implement direct support for this type! return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// High-level routine to lower various 512-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = Mask.size(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } } // Determine if this shuffle can be implemented with a KSHIFT instruction. // Returns the shift amount if possible or -1 if not. This is a simplified // version of matchShuffleAsShift. static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef Mask, int MaskOffset, const APInt &Zeroable) { int Size = Mask.size(); auto CheckZeros = [&](int Shift, bool Left) { for (int j = 0; j < Shift; ++j) if (!Zeroable[j + (Left ? 0 : (Size - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, bool Left) { unsigned Pos = Left ? Shift : 0; unsigned Low = Left ? 0 : Shift; unsigned Len = Size - Shift; return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset); }; for (int Shift = 1; Shift != Size; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) { Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR; return Shift; } return -1; } // Lower vXi1 vector shuffles. // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); unsigned NumElts = Mask.size(); // Try to recognize shuffles that are just padding a subvector with zeros. unsigned SubvecElts = 0; for (int i = 0; i != (int)NumElts; ++i) { if (Mask[i] >= 0 && Mask[i] != i) break; ++SubvecElts; } assert(SubvecElts != NumElts && "Identity shuffle?"); // Clip to a power 2. SubvecElts = PowerOf2Floor(SubvecElts); // Make sure the number of zeroable bits in the top at least covers the bits // not covered by the subvector. if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), Extract, DAG.getIntPtrConstant(0, DL)); } // Try to match KSHIFTs. // TODO: Support narrower than legal shifts by widening and extracting. if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) { unsigned Offset = 0; for (SDValue V : { V1, V2 }) { unsigned Opcode; int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); if (ShiftAmt >= 0) return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(ShiftAmt, DL, MVT::i8)); Offset += NumElts; // Increment for next iteration. } } MVT ExtVT; switch (VT.SimpleTy) { default: llvm_unreachable("Expected a vector of i1 elements"); case MVT::v2i1: ExtVT = MVT::v2i64; break; case MVT::v4i1: ExtVT = MVT::v4i32; break; case MVT::v8i1: // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit // shuffle. ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64; break; case MVT::v16i1: // Take 512-bit type, unless we are avoiding 512-bit types and have the // 256-bit operation available. ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16; break; case MVT::v32i1: // Take 512-bit type, unless we are avoiding 512-bit types and have the // 256-bit operation available. assert(Subtarget.hasBWI() && "Expected AVX512BW support"); ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8; break; case MVT::v64i1: ExtVT = MVT::v64i8; break; } V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask); // i1 was sign extended we can use X86ISD::CVT2MASK. int NumElems = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && (NumElems >= 32)) || (Subtarget.hasDQI() && (NumElems < 32))) return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), Shuffle, ISD::SETGT); return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); } /// Helper function that returns true if the shuffle mask should be /// commuted to improve canonicalization. static bool canonicalizeShuffleMaskWithCommute(ArrayRef Mask) { int NumElements = Mask.size(); int NumV1Elements = 0, NumV2Elements = 0; for (int M : Mask) if (M < 0) continue; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return true; assert(NumV1Elements > 0 && "No V1 indices"); if (NumV2Elements == 0) return false; // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum // indices for V2. When those are equal, try to ensure that the number of odd // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : Mask.slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) return true; if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) SumV2Indices += i; else if (Mask[i] >= 0) SumV1Indices += i; if (SumV2Indices < SumV1Indices) return true; if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) NumV2OddIndices += i % 2; else if (Mask[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return true; } } } return false; } /// Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc DL(Op); bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); assert((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"); bool V1IsUndef = V1.isUndef(); bool V2IsUndef = V2.isUndef(); if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node to second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return DAG.getCommutedVectorShuffle(*SVOp); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef && any_of(Mask, [NumElements](int M) { return M >= NumElements; })) { SmallVector NewMask(Mask.begin(), Mask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); } // Check for illegal shuffle mask element index values. int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; assert(llvm::all_of(Mask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && "Out of bounds shuffle index"); // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode()); // Create an alternative mask with info about zeroable elements. // Here we do not set undef elements as zeroable. SmallVector ZeroableMask(Mask.begin(), Mask.end()); if (V2IsZero) { assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); for (int i = 0; i != NumElements; ++i) if (Mask[i] != SM_SentinelUndef && Zeroable[i]) ZeroableMask[i] = SM_SentinelZero; } // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(ZeroableMask, WidenedMask)) { // Shuffle mask widening should not interfere with a broadcast opportunity // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); int NewNumElts = NumElements / 2; MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts); // Make sure that the new vector type is legal. For example, v2f64 isn't // legal on SSE1. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { if (V2IsZero) { // Modify the new Mask to take all zeros from the all-zero vector. // Choose indices that are blend-friendly. bool UsedZeroVector = false; assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() && "V2's non-undef elements are used?!"); for (int i = 0; i != NewNumElts; ++i) if (WidenedMask[i] == SM_SentinelZero) { WidenedMask[i] = i + NewNumElts; UsedZeroVector = true; } // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits // some elements to be undef. if (UsedZeroVector) V2 = getZeroVector(NewVT, Subtarget, DAG, DL); } V1 = DAG.getBitcast(NewVT, V1); V2 = DAG.getBitcast(NewVT, V2); return DAG.getBitcast( VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); } } // Commute the shuffle if it will improve canonicalization. if (canonicalizeShuffleMaskWithCommute(Mask)) return DAG.getCommutedVectorShuffle(*SVOp); if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is256BitVector()) return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is512BitVector()) return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (Is1BitVector) return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } /// Try to lower a VSELECT instruction to a vector shuffle. static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); // Only non-legal VSELECTs reach this lowering, convert those into generic // shuffles and re-use the shuffle lowering path for blends. SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond)) return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask); return SDValue(); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) return SDValue(); // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // If this VSELECT has a vector if i1 as a mask, it will be directly matched // with patterns on the mask registers on AVX-512. MVT CondVT = Cond.getSimpleValueType(); unsigned CondEltSize = Cond.getScalarValueSizeInBits(); if (CondEltSize == 1) return Op; // Variable blends are only legal from SSE4.1 onward. if (!Subtarget.hasSSE41()) return SDValue(); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition // into an i1 condition so that we can use the mask-based 512-bit blend // instructions. if (VT.getSizeInBits() == 512) { // Build a mask by testing the condition against zero. MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, DAG.getConstant(0, dl, CondVT), ISD::SETNE); // Now return a new VSELECT using the mask. return DAG.getSelect(dl, VT, Mask, LHS, RHS); } // SEXT/TRUNC cases where the mask doesn't match the destination size. if (CondEltSize != EltSize) { // If we don't have a sign splat, rely on the expansion. if (CondEltSize != DAG.ComputeNumSignBits(Cond)) return SDValue(); MVT NewCondSVT = MVT::getIntegerVT(EltSize); MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts); Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT); return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS); } // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; case MVT::v32i8: // The byte blends for AVX vectors were introduced only in AVX2. if (Subtarget.hasAVX2()) return Op; return SDValue(); case MVT::v8i16: case MVT::v16i16: { // Bitcast everything to the vXi8 type and use a vXi8 vselect. MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2); Cond = DAG.getBitcast(CastVT, Cond); LHS = DAG.getBitcast(CastVT, LHS); RHS = DAG.getBitcast(CastVT, RHS); SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS); return DAG.getBitcast(VT, Select); } } } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); return DAG.getBitcast(MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa(Op.getOperand(1))) return Op; } return SDValue(); } /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512/128 if (!isa(Idx)) { unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; // Extend to natively supported kshift. unsigned NumElems = VecVT.getVectorNumElements(); MVT WideVecVT = VecVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, DAG.getUNDEF(WideVecVT), Vec, DAG.getIntPtrConstant(0, dl)); } // Use kshiftr instruction to move to the lower element. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); } SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG, Subtarget); if (!isa(Idx)) { // Its more profitable to go through memory (1 cycles throughput) // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) // IACA tool was used to get performance estimation // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) // // example : extractelement <16 x i8> %a, i32 %i // // Block Throughput: 3.00 Cycles // Throughput Bottleneck: Port5 // // | Num Of | Ports pressure in cycles | | // | Uops | 0 - DV | 5 | 6 | 7 | | // --------------------------------------------- // | 1 | | 1.0 | | | CP | vmovd xmm1, edi // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 // Total Num Of Uops: 4 // // // Block Throughput: 1.00 Cycles // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 // // | | Ports pressure in cycles | | // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | // --------------------------------------------------------- // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] // Total Num Of Uops: 4 return SDValue(); } unsigned IdxVal = cast(Idx)->getZExtValue(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. if (VecVT.is256BitVector() || VecVT.is512BitVector()) { // Get the 128-bit vector. Vec = extract128BitVector(Vec, IdxVal, DAG, dl); MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(IdxVal, dl)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); MVT VT = Op.getSimpleValueType(); if (VT.getSizeInBits() == 16) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless // we're going to zero extend the register or fold the store (SSE41 only). if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); // Transform it so it match pextrw which produces a 32-bit result. SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (Subtarget.hasSSE41()) if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; // TODO: We only extract a single element from v16i8, we can probably afford // to be more aggressive here before using the default approach of spilling to // stack. if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { // Extract either the lowest i32 or any i16, and extract the sub-byte. int DWordIdx = IdxVal / 4; if (DWordIdx == 0) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), DAG.getIntPtrConstant(DWordIdx, dl)); int ShiftVal = (IdxVal % 4) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, DAG.getConstant(ShiftVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } int WordIdx = IdxVal / 2; SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, DAG.getBitcast(MVT::v8i16, Vec), DAG.getIntPtrConstant(WordIdx, dl)); int ShiftVal = (IdxVal % 2) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, DAG.getConstant(ShiftVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast(IdxVal), -1, -1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. if (IdxVal == 0) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } return SDValue(); } /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Elt = Op.getOperand(1); SDValue Idx = Op.getOperand(2); MVT VecVT = Vec.getSimpleValueType(); if (!isa(Idx)) { // Non constant index. Extend source and destination, // insert element and then truncate the result. unsigned NumElts = VecVT.getVectorNumElements(); MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } // Copy into a k-register, extract to v1i1 and insert_subvector. SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Op.getOperand(2)); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG, Subtarget); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); auto *N2C = dyn_cast(N2); if (!N2C || N2C->getAPIntValue().uge(NumElts)) return SDValue(); uint64_t IdxVal = N2C->getZExtValue(); bool IsZeroElt = X86::isZeroNode(N1); bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); // If we are inserting a element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && 16 <= EltVT.getSizeInBits()) { SmallVector BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) : getOnesVector(VT, DAG, dl); return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { // With a 256-bit vector, we can insert into the zero element efficiently // using a blend if we have AVX or AVX2 and the right data type. if (VT.is256BitVector() && IdxVal == 0) { // TODO: It is worthwhile to cast integer to floating point and back // and incur a domain crossing penalty if that's what we'll end up // doing anyway after extracting to a 128-bit vector. if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); N2 = DAG.getIntPtrConstant(1, dl); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); } } // Get the desired 128-bit vector chunk. SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(NumEltsIn128)); // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getIntPtrConstant(IdxIn128, dl)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); // This will be just movd/movq/movss/movsd. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) && (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || EltVT == MVT::i64)) { N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); } // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. SSE41 required for pinsrb. if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { unsigned Opc; if (VT == MVT::v8i16) { assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW"); Opc = X86ISD::PINSRW; } else { assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector"); assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB"); Opc = X86ISD::PINSRB; } if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } if (Subtarget.hasSSE41()) { if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into // these bits. For example (insert (extract, 3), 2) could be matched by // putting the '3' into bits [7:6] of X86ISD::INSERTPS. // Bits [5:4] of the constant are the destination select. This is the // value of the incoming immediate. // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather // than an insertps. Blends are simpler operations in hardware and so // will always have equal or better performance than insertps. // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. N2 = DAG.getIntPtrConstant(1, dl); N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); } N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } // PINSR* works with constant index. if (EltVT == MVT::i32 || EltVT == MVT::i64) return Op; } return SDValue(); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); // It's always cheaper to replace a xor+movd with xorps and simplifies further // combines. if (X86::isZeroNode(Op.getOperand(0))) return getZeroVector(OpVT, Subtarget, DAG, dl); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. unsigned SizeFactor = OpVT.getSizeInBits() / 128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 && "Expected an SSE type!"); // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. if (OpVT == MVT::v4i32) return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1); return insert1BitVector(Op, DAG, Subtarget); } static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Only vXi1 extract_subvectors need custom lowering"); SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); if (!isa(Idx)) return SDValue(); unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; MVT VecVT = Vec.getSimpleValueType(); unsigned NumElems = VecVT.getVectorNumElements(); // Extend to natively supported kshift. MVT WideVecVT = VecVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, DAG.getUNDEF(WideVecVT), Vec, DAG.getIntPtrConstant(0, dl)); } // Shift to the LSB. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); } // Returns the appropriate wrapper opcode for a global reference. unsigned X86TargetLowering::getGlobalWrapperKind( const GlobalValue *GV, const unsigned char OpFlags) const { // References to absolute symbols are never PC-relative. if (GV && GV->isAbsoluteSymbolRef()) return X86ISD::Wrapper; CodeModel::Model M = getTargetMachine().getCodeModel(); if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) return X86ISD::WrapperRIP; // GOTPCREL references must always use RIP. if (OpFlags == X86II::MO_GOTPCREL) return X86ISD::WrapperRIP; return X86ISD::Wrapper; } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); SDLoc DL(JT); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); return Result; } SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget.classifyBlockAddressReference(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } return Result; } /// Creates target global address or external symbol nodes for calls or /// other uses. SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall) const { // Unpack the global address or external symbol. const SDLoc &dl = SDLoc(Op); const GlobalValue *GV = nullptr; int64_t Offset = 0; const char *ExternalSym = nullptr; if (const auto *G = dyn_cast(Op)) { GV = G->getGlobal(); Offset = G->getOffset(); } else { const auto *ES = cast(Op); ExternalSym = ES->getSymbol(); } // Calculate some flags for address lowering. const Module &Mod = *DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlags; if (ForCall) OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod); else OpFlags = Subtarget.classifyGlobalReference(GV, Mod); bool HasPICReg = isGlobalRelativeToPICBase(OpFlags); bool NeedsLoad = isGlobalStubReference(OpFlags); CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (GV) { // Create a target global address if this is a global. If possible, fold the // offset into the global address reference. Otherwise, ADD it on later. int64_t GlobalOffset = 0; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { std::swap(GlobalOffset, Offset); } Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags); } else { // If this is not a global address, this must be an external symbol. Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags); } // If this is a direct call, avoid the wrapper if we don't need to do any // loads or adds. This allows SDAG ISel to match direct calls. if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0) return Result; Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (HasPICReg) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. if (NeedsLoad) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, DAG.getConstant(Offset, dl, PtrVT)); return Result; } SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR; if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI.setAdjustsStack(true); MFI.setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; SDLoc dl(GA); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo *MFI = DAG.getMachineFunction() .getInfo(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (is64Bit) { Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations // of Base. // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); } // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is // initialexec. unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; } else if (model == TLSModel::InitialExec) { if (is64Bit) { OperandFlags = X86II::MO_GOTTPOFF; WrapperKind = X86ISD::WrapperRIP; } else { OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; } } else { llvm_unreachable("Unexpected model"); } // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool PositionIndependent = isPositionIndependent(); if (Subtarget.isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget.is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), PositionIndependent); } llvm_unreachable("Unknown TLS model."); } if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); // Lowering the machine isd will make sure everything is in the right // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), DAG.getIntPtrConstant(0, DL, true), Chain.getValue(1), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } if (Subtarget.isOSWindows()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) // mov rcx, qword [rdx+rcx*8] // mov eax, .tls$:tlsvar // [rax+rcx] contains the address // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget.is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); SDValue TlsArray = Subtarget.is64Bit() ? DAG.getIntPtrConstant(0x58, dl) : (Subtarget.isTargetWindowsGNU() ? DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr)); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { res = ThreadPointer; } else { // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); if (Subtarget.is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, MachinePointerInfo(), MVT::i32); else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); auto &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); } llvm_unreachable("TLS not implemented for this target."); } /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. /// TODO: Can this be moved to general expansion code? static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, dl, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i8)) : DAG.getConstant(0, dl, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } // If the shift amount is larger or equal than the width of a part we can't // rely on the results of shld/shrd. Insert a test and select the appropriate // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, dl, MVT::i8)); SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode, DAG.getConstant(0, dl, MVT::i8), ISD::SETNE); SDValue Hi, Lo; if (Op.getOpcode() == ISD::SHL_PARTS) { Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2); Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3); } else { Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2); Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3); } return DAG.getMergeValues({ Lo, Hi }, dl); } static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) && "Unexpected funnel shift opcode!"); SDLoc DL(Op); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Amt = Op.getOperand(2); bool IsFSHR = Op.getOpcode() == ISD::FSHR; if (VT.isVector()) { assert(Subtarget.hasVBMI2() && "Expected VBMI2"); if (IsFSHR) std::swap(Op0, Op1); APInt APIntShiftAmt; if (isConstantSplat(Amt, APIntShiftAmt)) { uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8)); } return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, Op0, Op1, Amt); } assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); if (IsFSHR) std::swap(Op0, Op1); // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. if (VT == MVT::i16) Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, DAG.getConstant(15, DL, Amt.getValueType())); unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD); return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt); } // Try to use a packed vector operation to handle i64 on 32-bit targets when // AVX512DQ is enabled. static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() || (VT != MVT::f32 && VT != MVT::f64)) return SDValue(); // Pack the i64 into a vector, do the operation and extract. // Using 256-bit to ensure result is 128-bits for f32 case. unsigned NumElts = Subtarget.hasVLX() ? 4 : 8; MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts); MVT VecVT = MVT::getVectorVT(VT, NumElts); SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); } static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget) { switch (Opcode) { case ISD::SINT_TO_FP: // TODO: Handle wider types with AVX/AVX512. if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32) return false; // CVTDQ2PS or (V)CVTDQ2PD return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64); case ISD::UINT_TO_FP: // TODO: Handle wider types and i64 elements. if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32) return false; // VCVTUDQ2PS or VCVTUDQ2PD return ToVT == MVT::v4f32 || ToVT == MVT::v4f64; default: return false; } } /// Given a scalar cast operation that is extracted from a vector, try to /// vectorize the cast op followed by extraction. This will avoid an expensive /// round-trip between XMM and GPR. static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // TODO: This could be enhanced to handle smaller integer types by peeking // through an extend. SDValue Extract = Cast.getOperand(0); MVT DestVT = Cast.getSimpleValueType(); if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Extract.getOperand(1))) return SDValue(); // See if we have a 128-bit vector cast op for this type of cast. SDValue VecOp = Extract.getOperand(0); MVT FromVT = VecOp.getSimpleValueType(); unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits(); MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM); MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM); if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget)) return SDValue(); // If we are extracting from a non-zero element, first shuffle the source // vector to allow extracting from element zero. SDLoc DL(Cast); if (!isNullConstant(Extract.getOperand(1))) { SmallVector Mask(FromVT.getVectorNumElements(), -1); Mask[0] = Extract.getConstantOperandVal(1); VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask); } // If the source vector is wider than 128-bits, extract the low part. Do not // create an unnecessarily wide vector cast op. if (FromVT != Vec128VT) VecOp = extract128BitVector(VecOp, 0, DAG, DL); // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast, DAG.getIntPtrConstant(0, DL)); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); } return SDValue(); } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); // These are really Legal; return the operand so the caller accepts it as // Legal. if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT)) return Op; if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) return Op; if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; SDValue ValueToStore = Op.getOperand(0); if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); unsigned ByteSize = SrcVT.getSizeInBits() / 8; FrameIndexSDNode *FI = dyn_cast(StackSlot); MachineMemOperand *LoadMMO; if (FI) { int SSFI = FI->getIndex(); LoadMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { LoadMMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } SDValue FILDOps[] = {Chain, StackSlot}; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, FILDOps, SrcVT, LoadMMO); if (useSSE) { Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is glued to the FILD_FLAG. This // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); unsigned SSFISize = Op.getValueSizeInBits() / 8; int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag}; MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, Op.getValueType(), StoreMMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); } return Result; } /// 64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } #ifdef __SSE3__ haddpd %xmm0, %xmm0 #else pshufd $0x4e, %xmm0, %xmm1 addpd %xmm1, %xmm0 #endif */ SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); SmallVector CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)))); CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget.hasSSE3()) { // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'. Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0, dl)); } /// 32-bit unsigned integer to float expansion. static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(0)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Load), DAG.getIntPtrConstant(0, dl)); // Or the load with the bias. SDValue Or = DAG.getNode( ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType()); } static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); SDValue N0 = Op.getOperand(0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); // Legalize to v4i32 type. N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); if (Subtarget.hasAVX512()) return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, // but using v2i32 to v2f64 with X86ISD::CVTSI2P. SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32); SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); // Two to the power of half-word-size. SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64); // Clear upper part of LO, lower HI. SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask); SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI); fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW); SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO); // Add the two halves. return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO); } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); // #else // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; // uint4 hi = (v >> 16) | (uint4) 0x53000000; // #endif // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; // We shouldn't use it when unsafe-fp-math is enabled though: we might later // reassociate the two FADDs, and if we do that, the algorithm fails // spectacularly (PR24512). // FIXME: If we ever have some kind of Machine FMF, this should be marked // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because // there's also the MachineCombiner reassociations happening on Machine IR. if (DAG.getTarget().Options.UnsafeFPMath) return SDValue(); SDLoc DL(Op); SDValue V = Op->getOperand(0); MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && "Unsupported custom type"); // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 // -- 0x53000000 // - A shift: // -- v >> 16 // Create the splat vector for 0x4b000000. SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT); // Create the splat vector for 0x53000000. SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT); // Create the right shift. SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT); SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); SDValue Low, High; if (Subtarget.hasSSE41()) { MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); // High will be bitcasted right away, so do not bother bitcasting back to // its original type. High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); // uint4 hi = (v >> 16) | (uint4) 0x53000000; High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). SDValue VecCstFAdd = DAG.getConstantFP( APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = Op.getOperand(0); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); switch (SrcVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); case MVT::v2i32: return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: assert(!Subtarget.hasAVX512()); return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); } } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. return Op; } if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); return Fild; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue ValueToStore = Op.getOperand(0); if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) int SSFI = cast(StackSlot)->getIndex(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), // just return an SDValue(). // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 // to i16, i32 or i64, and we lower it to a legal sequence and return the // result. SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. return SDValue(); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. bool UnsignedFixup = !IsSigned && DstTy == MVT::i64; if (!IsSigned && DstTy != MVT::i64) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getStoreSize(); int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. if (UnsignedFixup) { // // Conversion to unsigned i64 is implemented with a select, // depending on whether the source value fits in the range // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent // to XOR'ing the high 32 bits with Adjust. // // Being a power of 2, Thresh is exactly representable in all FP formats. // For X87 we'd like to use the smallest FP type for this constant, but // for DAG type consistency we have to match the FP operand type. APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &LosesInfo); else if (TheVT == MVT::f80) Status = Thresh.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &LosesInfo); assert(Status == APFloat::opOK && !LosesInfo && "FP conversion should have been exact"); SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); SDValue Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Adjust = DAG.getSelect(DL, MVT::i64, Cmp, DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(APInt::getSignMask(64), DL, MVT::i64)); SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); } MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); SDVTList Tys = DAG.getVTList(TheVT, MVT::Other); SDValue Ops[] = { Chain, StackSlot }; unsigned FLDSize = TheVT.getStoreSize(); assert(FLDSize <= MemSize && "Stack slot not big enough"); MachineMemOperand *MMO = MF.getMachineMemOperand( MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO); Chain = Value.getValue(1); } // Build the FP_TO_INT*_IN_MEM MachineMemOperand *MMO = MF.getMachineMemOperand( MPI, MachineMemOperand::MOStore, MemSize, MemSize); SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI); // If we need an unsigned fixup, XOR the result with adjust. if (UnsignedFixup) Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust); return Res; } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) && "Unexpected extension opcode"); assert(VT.getVectorNumElements() == VT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::i64) && "Unexpected element type"); assert((InVT.getVectorElementType() == MVT::i8 || InVT.getVectorElementType() == MVT::i16 || InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc); // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); return DAG.getNode(ExtendInVecOpc, dl, VT, In); } if (Subtarget.hasInt256()) return Op; // Optimize vectors in AVX mode: // // v8i16 -> v8i32 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32. // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. // Concat upper and lower parts. // // v4i32 -> v4i64 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64. // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); // Short-circuit if we can determine that each 128-bit half is the same value. // Otherwise, this is difficult to match and optimize. if (auto *Shuf = dyn_cast(In)) if (hasIdenticalHalvesShuffleMask(Shuf->getMask())) return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo); SDValue ZeroVec = DAG.getConstant(0, dl, InVT); SDValue Undef = DAG.getUNDEF(InVT); bool NeedZero = Opc == ISD::ZERO_EXTEND; SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); OpHi = DAG.getBitcast(HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } // Helper to split and extend a v16i1 mask to v16i8 or v16i16. static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG) { assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT."); SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, DAG.getIntPtrConstant(0, dl)); SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, DAG.getIntPtrConstant(8, dl)); Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo); Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This // avoids a constant pool load. if (VT.getVectorElementType() != MVT::i8) { SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In); return DAG.getNode(ISD::SRL, DL, VT, Extend, DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); } // Extend VT if BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI()) { // If v16i32 is to be avoided, we'll need to split and concatenate. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG); ExtVT = MVT::getVectorVT(MVT::i32, NumElts); } // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, DL)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue One = DAG.getConstant(1, DL, WideVT); SDValue Zero = DAG.getConstant(0, DL, WideVT); SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero); // Truncate if we had to extend above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(MVT::i8, NumElts); SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal, DAG.getIntPtrConstant(0, DL)); return SelectedVal; } static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); if (SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); assert(Subtarget.hasAVX() && "Expected AVX support"); return LowerAVXExtend(Op, DAG, Subtarget); } /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS. /// It makes use of the fact that vectors with enough leading sign/zero bits /// prevent the PACKSS/PACKUS from saturating the results. /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates /// within each 128-bit lane. static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && "Unexpected PACK opcode"); assert(DstVT.isVector() && "VT not a vector?"); // Requires SSE2 but AVX512 has fast vector truncate. if (!Subtarget.hasSSE2()) return SDValue(); EVT SrcVT = In.getValueType(); // No truncation required, we might get here due to recursive calls. if (SrcVT == DstVT) return In; // We only support vector truncation to 64bits or greater from a // 128bits or greater source. unsigned DstSizeInBits = DstVT.getSizeInBits(); unsigned SrcSizeInBits = SrcVT.getSizeInBits(); if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0) return SDValue(); unsigned NumElems = SrcVT.getVectorNumElements(); if (!isPowerOf2_32(NumElems)) return SDValue(); LLVMContext &Ctx = *DAG.getContext(); assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation"); EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); // Pack to the largest type possible: // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. EVT InVT = MVT::i16, OutVT = MVT::i8; if (SrcVT.getScalarSizeInBits() > 16 && (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) { InVT = MVT::i32; OutVT = MVT::i16; } // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector. if (SrcVT.is128BitVector()) { InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits()); In = DAG.getBitcast(InVT, In); SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In); Res = extractSubVector(Res, 0, DAG, DL, 64); return DAG.getBitcast(DstVT, Res); } // Extract lower/upper subvectors. unsigned NumSubElts = NumElems / 2; SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); unsigned SubSizeInBits = SrcSizeInBits / 2; InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors. if (SrcVT.is256BitVector() && DstVT.is128BitVector()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); return DAG.getBitcast(DstVT, Res); } // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors. // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK). if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. SmallVector Mask; int Scale = 64 / OutVT.getScalarSizeInBits(); scaleShuffleMask(Scale, ArrayRef({ 0, 2, 1, 3 }), Mask); Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector()) return DAG.getBitcast(DstVT, Res); // If 512bit -> 128bit truncate another stage. EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); Res = DAG.getBitcast(PackedVT, Res); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } // Recursively pack lower/upper subvectors, concat result and pack again. assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater"); EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts); Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget); PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; if (InVT.getScalarSizeInBits() <= 16) { if (Subtarget.hasBWI()) { // legal, will go to VPMOVB2M, VPMOVW2M if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. // Shift packed bytes not supported natively, bitcast to word MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); In = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In), DAG.getConstant(ShiftInx, DL, ExtVT)); In = DAG.getBitcast(InVT, In); } return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); } // Use TESTD/Q, extended vector to packed dword/qword. assert((InVT.is256BitVector() || InVT.is128BitVector()) && "Unexpected vector type."); unsigned NumElts = InVT.getVectorNumElements(); assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements"); // We need to change to a wider element type that we have support for. // For 8 element vectors this is easy, we either extend to v8i32 or v8i64. // For 16 element vectors we extend to v16i32 unless we are explicitly // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors // we need to split into two 8 element vectors which we can extend to v8i32, // truncate and concat the results. There's an additional complication if // the original type is v16i8. In that case we can't split the v16i8 so // first we pre-extend it to v16i16 which we can split to v8i16, then extend // to v8i32, truncate that to v8i1 and concat the two halves. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) { if (InVT == MVT::v16i8) { // First we need to sign extend up to 256-bits so we can split that. InVT = MVT::v16i16; In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In); } SDValue Lo = extract128BitVector(In, 0, DAG, DL); SDValue Hi = extract128BitVector(In, 8, DAG, DL); // We're split now, just emit two truncates and a concat. The two // truncates will trigger legalization to come back to this function. Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo); Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } // We either have 8 elements or we're allowed to use 512-bit vectors. // If we have VLX, we want to use the narrowest vector that can get the // job done so we use vXi32. MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; ShiftInx = InVT.getScalarSizeInBits() - 1; } if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. In = DAG.getNode(ISD::SHL, DL, InVT, In, DAG.getConstant(ShiftInx, DL, InVT)); } // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m. if (Subtarget.hasDQI()) return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); unsigned InNumEltBits = InVT.getScalarSizeInBits(); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); // If called by the legalizer just return. if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) return SDValue(); if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { // word to byte only under BWI. Otherwise we have to promoted to v16i32 // and then truncate that. But we should only do that if we haven't been // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be // handled by isel patterns. if (InVT != MVT::v16i16 || Subtarget.hasBWI() || Subtarget.canExtendTo512DQ()) return Op; } unsigned NumPackedSignBits = std::min(VT.getScalarSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; // Truncate with PACKUS if we are truncating a vector with leading zero bits // that extend all the way to the packed/truncated value. // Pre-SSE41 we can only use PACKUSWB. KnownBits Known = DAG.computeKnownBits(In); if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) return V; // Truncate with PACKSS if we are truncating a vector with sign-bits that // extend all the way to the packed/truncated value. if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2, DL)); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomes PSHUFB. if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); // The PSHUFB mask: static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1 }; In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask2[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(4, DL)); OpLo = DAG.getBitcast(MVT::v16i8, OpLo); OpHi = DAG.getBitcast(MVT::v16i8, OpHi); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); // The MOVLHPS Mask: static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); return DAG.getBitcast(MVT::v8i16, res); } if (VT == MVT::v16i8 && InVT == MVT::v16i16) { // Use an AND to zero uppper bits for PACKUS. In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT)); SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, DAG.getIntPtrConstant(0, DL)); SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, DAG.getIntPtrConstant(8, DL)); return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi); } // Handle truncation of V256 to V128 using shuffles. assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); assert(Subtarget.hasAVX() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); SmallVector MaskVec(NumElems * 2, -1); // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; In = DAG.getBitcast(NVT, In); SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; MVT VT = Op.getSimpleValueType(); SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { // Widen to 512-bits. ResVT = MVT::v8i32; TruncVT = MVT::v8i1; Opc = ISD::FP_TO_UINT; Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, DAG.getUNDEF(MVT::v8f64), Src, DAG.getIntPtrConstant(0, dl)); } SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, DAG.getIntPtrConstant(0, dl)); } assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); } return SDValue(); } assert(!VT.isVector()); bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); if (!IsSigned && Subtarget.hasAVX512()) { // Conversions from f32/f64 should be legal. if (UseSSEReg) return Op; // Use default expansion. if (VT == MVT::i64) return SDValue(); } // Promote i16 to i32 if we can use a SSE operation. if (VT == MVT::i16 && UseSSEReg) { assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } // If this is a SINT_TO_FP using SSEReg we're done. if (UseSSEReg && IsSigned) return Op; // Fall back to X87. if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned)) return V; llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT))); } /// Horizontal vector math instructions may be slower than normal math with /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch /// implementation, and likely shuffle complexity of the alternate sequence. static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If both operands have other uses, this is probably not profitable. SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (!LHS.hasOneUse() && !RHS.hasOneUse()) return Op; // FP horizontal add/sub were added with SSE3. Integer with SSSE3. bool IsFP = Op.getSimpleValueType().isFloatingPoint(); if (IsFP && !Subtarget.hasSSE3()) return Op; if (!IsFP && !Subtarget.hasSSSE3()) return Op; // Extract from a common vector. if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getOperand(0) != RHS.getOperand(0) || !isa(LHS.getOperand(1)) || !isa(RHS.getOperand(1)) || !shouldUseHorizontalOp(true, DAG, Subtarget)) return Op; // Allow commuted 'hadd' ops. // TODO: Allow commuted (f)sub by negating the result of (F)HSUB? unsigned HOpcode; switch (Op.getOpcode()) { case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: llvm_unreachable("Trying to lower unsupported opcode to horizontal op"); } unsigned LExtIndex = LHS.getConstantOperandVal(1); unsigned RExtIndex = RHS.getConstantOperandVal(1); if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 && (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD)) std::swap(LExtIndex, RExtIndex); if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1)) return Op; SDValue X = LHS.getOperand(0); EVT VecVT = X.getValueType(); unsigned BitWidth = VecVT.getSizeInBits(); unsigned NumLanes = BitWidth / 128; unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes; assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && "Not expecting illegal vector widths here"); // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit // equivalent, so extract the 256/512-bit source op to 128-bit if we can. SDLoc DL(Op); if (BitWidth == 256 || BitWidth == 512) { unsigned LaneIdx = LExtIndex / NumEltsPerLane; X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL); LExtIndex %= NumEltsPerLane; } // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, DAG.getIntPtrConstant(LExtIndex / 2, DL)); } /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); } /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) for (SDNode *User : Op->uses()) if (User->getOpcode() == ISD::FNEG) return Op; SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFABSorFNEG"); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. // Using a 16-byte mask allows folding the load of the mask with // the logic op, so it can save (~4 bytes) on code size. bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; unsigned EltBits = VT.getScalarSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); unsigned LogicOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, // and extract the scalar result back out. Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Mag = Op.getOperand(0); SDValue Sign = Op.getOperand(1); SDLoc dl(Op); // If the sign operand is smaller, extend it first. MVT VT = Op.getSimpleValueType(); if (Sign.getSimpleValueType().bitsLT(VT)) Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); // And if it is bigger, shrink it first. if (Sign.getSimpleValueType().bitsGT(VT)) Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl)); // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. bool IsF128 = (VT == MVT::f128); assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); // Perform all scalar logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. // TODO: This isn't necessary. If we used scalar types, we might avoid some // unnecessary splats, but we might miss load folding opportunities. Should // this decision be based on OptimizeForSize? bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; // The mask constants are automatically splatted for vector types. unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue SignMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); SDValue MagMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). if (IsFakeVector) Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); // Next, clear the sign bit from the first operand (magnitude). // TODO: If we had general constant folding for FP logic ops, this check // wouldn't be necessary. SDValue MagBits; if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) { APFloat APF = Op0CN->getValueAPF(); APF.clearSign(); MagBits = DAG.getConstantFP(APF, dl, LogicVT); } else { // If the magnitude operand wasn't a constant, we need to AND out the sign. if (IsFakeVector) Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); } // OR the magnitude value with the sign bit. SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT OpVT = N0.getSimpleValueType(); assert((OpVT == MVT::f32 || OpVT == MVT::f64) && "Unexpected type for FGETSIGN"); // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); Res = DAG.getZExtOrTrunc(Res, dl, VT); Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); return Res; } /// Helper for creating a X86ISD::SETCC node. static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); } /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) /// style scalarized (associative) reduction patterns. static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl &SrcOps) { SmallVector Opnds; DenseMap SrcOpMap; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. assert(Op.getOpcode() == unsigned(BinOp) && "Unexpected bit reduction opcode"); Opnds.push_back(Op.getOperand(0)); Opnds.push_back(Op.getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl::const_iterator I = Opnds.begin() + Slot; // BFS traverse all BinOp operands. if (I->getOpcode() == unsigned(BinOp)) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. e += 2; // 2 more nodes (LHS and RHS) are pushed. continue; } // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return false; // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa(Idx)) return false; SDValue Src = I->getOperand(0); DenseMap::iterator M = SrcOpMap.find(Src); if (M == SrcOpMap.end()) { VT = Src.getValueType(); // Quit if not the same type. if (SrcOpMap.begin() != SrcOpMap.end() && VT != SrcOpMap.begin()->first.getValueType()) return false; unsigned NumElts = VT.getVectorNumElements(); APInt EltCount = APInt::getNullValue(NumElts); M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; SrcOps.push_back(Src); } // Quit if element already used. unsigned CIdx = cast(Idx)->getZExtValue(); if (M->second[CIdx]) return false; M->second.setBit(CIdx); } // Quit if not all elements are used. for (DenseMap::const_iterator I = SrcOpMap.begin(), E = SrcOpMap.end(); I != E; ++I) { if (!I->second.isAllOnesValue()) return false; } return true; } // Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &X86CC) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget.hasSSE41() || !Op->hasOneUse()) return SDValue(); SmallVector VecIns; if (!matchBitOpReduction(Op, ISD::OR, VecIns)) return SDValue(); // Quit if not 128/256-bit vector. EVT VT = VecIns[0].getValueType(); if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); SDLoc DL(Op); MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); // If more than one full vector is evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. SDValue LHS = VecIns[Slot]; SDValue RHS = VecIns[Slot + 1]; VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, MVT::i8); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// return true if \c Op has a use that doesn't just read flags. static bool hasNonFlagsUse(SDValue Op) { for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned UOpNo = UI.getOperandNo(); if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { // Look pass truncate. UOpNo = User->use_begin().getOperandNo(); User = *User->use_begin(); } if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) return true; } return false; } /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; bool NeedOF = false; switch (X86CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. switch (Op->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; LLVM_FALLTHROUGH; default: NeedOF = true; break; } break; } } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; SDValue ArithOp = Op; // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. if (!hasNonFlagsUse(Op)) break; LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: case ISD::OR: case ISD::XOR: // Transform to an x86-specific ALU node with flags if there is a chance of // using an RMW op or only the flags are used. Otherwise, leave // the node alone and emit a 'test' instruction. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC && UI->getOpcode() != ISD::STORE) goto default_case; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::ADD: Opcode = X86ISD::ADD; break; case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: Opcode = X86ISD::OR; break; } NumOperands = 2; break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); default: default_case: break; } if (Opcode == 0) { // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New); return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const { if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG, Subtarget); EVT CmpVT = Op0.getValueType(); if (CmpVT.isFloatingPoint()) return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); // Only promote the compare up to I32 if it is a 16 bit operation // with an immediate. 16 bit immediates are to be avoided. if (CmpVT == MVT::i16 && !Subtarget.isAtom() && !DAG.getMachineFunction().getFunction().hasMinSize()) { ConstantSDNode *COp0 = dyn_cast(Op0); ConstantSDNode *COp1 = dyn_cast(Op1); // Don't do this if the immediate can fit in 8-bits. if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { // For equality comparisons try to use SIGN_EXTEND if the input was // truncate from something with enough sign bits. if (Op0.getOpcode() == ISD::TRUNCATE) { SDValue In = Op0.getOperand(0); unsigned EffBits = In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; if (EffBits <= 16) ExtendOp = ISD::SIGN_EXTEND; } else if (Op1.getOpcode() == ISD::TRUNCATE) { SDValue In = Op1.getOperand(0); unsigned EffBits = In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; if (EffBits <= 16) ExtendOp = ISD::SIGN_EXTEND; } } CmpVT = MVT::i32; Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0); Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); } } // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return Sub.getValue(1); } /// Convert a comparison if required by the subtarget. SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. if (Subtarget.hasCMov() || Cmp.getOpcode() != X86ISD::CMP || !Cmp.getOperand(0).getValueType().isFloatingPoint() || !Cmp.getOperand(1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); // Some 64-bit targets lack SAHF support, but they do support FCOMI. assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } /// Check if replacement of SQRT with RSQRT should be disabled. bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // We never want to use both SQRT and RSQRT instructions for the same input. if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; if (VT.isVector()) return Subtarget.hasFastVectorFSQRT(); return Subtarget.hasFastScalarFSQRT(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { EVT VT = Op.getValueType(); // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 // after legalize types. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || (VT == MVT::v8f32 && Subtarget.hasAVX()) || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; UseOneConstNR = false; // There is no FSQRT for 512-bits, but there is RSQRT14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; return DAG.getNode(Opcode, SDLoc(Op), VT, Op); } return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { EVT VT = Op.getValueType(); // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // It is likely not profitable to do this for f64 because a double-precision // reciprocal estimate with refinement on x86 prior to FMA requires // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1()) || (VT == MVT::v8f32 && Subtarget.hasAVX()) || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { // Enable estimate codegen with 1 refinement step for vector division. // Scalar division estimates are disabled because they break too much // real-world code. These defaults are intended to match GCC behavior. if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) return SDValue(); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; // There is no FSQRT for 512-bits, but there is RCP14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP; return DAG.getNode(Opcode, SDLoc(Op), VT, Op); } return SDValue(); } /// If we have at least two divisions that use the same divisor, convert to /// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, SDValue &X86CC) { assert(And.getOpcode() == ISD::AND && "Expected AND node!"); SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) Op0 = Op0.getOperand(0); if (Op1.getOpcode() == ISD::TRUNCATE) Op1 = Op1.getOperand(0); SDValue Src, BitNo; if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { KnownBits Known = DAG.computeKnownBits(Op0); if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) return SDValue(); } Src = Op1; BitNo = Op0.getOperand(1); } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast(Op1); uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { Src = AndLHS.getOperand(0); BitNo = AndLHS.getOperand(1); } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, Src.getValueType()); } } } // No patterns found, give up. if (!Src.getNode()) return SDValue(); // If Src is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because // the encoding for the i16 version is larger than the i32 version. // Also promote i16 to i32 for performance / code size reason. if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16) Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src); // See if we can use the 32-bit instruction instead of the 64-bit one for a // shorter encoding. Since the former takes the modulo 32 of BitNo and the // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is // known to be zero. if (Src.getValueType() == MVT::i64 && DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); // If the operand types disagree, extend the shift amount to match. Since // BT ignores high bits (like shifts) we can use anyextend. if (Src.getValueType() != BitNo.getValueType()) BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, dl, MVT::i8); return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); } /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask /// CMPs. static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1) { unsigned SSECC; bool Swap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; case ISD::SETOGE: case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLE: case ISD::SETOLE: SSECC = 2; break; case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: SSECC = 5; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: SSECC = 8; break; case ISD::SETONE: SSECC = 12; break; } if (Swap) std::swap(Op0, Op1); return SSECC; } /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then /// concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); SDValue CC = Op.getOperand(2); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(VT.getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); // If this is a seteq make sure any build vectors of all zeros are on the RHS. // This helps with vptestm matching. // TODO: Should we just canonicalize the setcc during DAG combine? if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) && ISD::isBuildVectorAllZeros(Op0.getNode())) std::swap(Op0, Op1); // Prefer SETGT over SETLT. if (SetCCOpcode == ISD::SETLT) { SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode); std::swap(Op0, Op1); } return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode); } /// Given a buildvector constant, return a new vector constant with each element /// incremented or decremented. If incrementing or decrementing would result in /// unsigned overflow or underflow or this is not a simple vector constant, /// return an empty value. static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) { auto *BV = dyn_cast(V.getNode()); if (!BV) return SDValue(); MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); SmallVector NewVecC; SDLoc DL(V); for (unsigned i = 0; i < NumElts; ++i) { auto *Elt = dyn_cast(BV->getOperand(i)); if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT) return SDValue(); // Avoid overflow/underflow. const APInt &EltC = Elt->getAPIntValue(); if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue())) return SDValue(); NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT)); } return DAG.getBuildVector(VT, DL, NewVecC); } /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for /// Op0 u<= Op1: /// t = psubus Op0, Op1 /// pcmpeq t, <0..0> static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (!Subtarget.hasSSE2()) return SDValue(); MVT VET = VT.getVectorElementType(); if (VET != MVT::i8 && VET != MVT::i16) return SDValue(); switch (Cond) { default: return SDValue(); case ISD::SETULT: { // If the comparison is against a constant we can turn this into a // setule. With psubus, setule does not require a swap. This is // beneficial because the constant in the register is no longer // destructed as the destination so it can be hoisted out of a loop. // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) return SDValue(); SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; break; } case ISD::SETUGT: { // If the comparison is against a constant, we can turn this into a setuge. // This is beneficial because materializing a constant 0 for the PCMPEQ is // probably cheaper than XOR+PCMPGT using 2 different vector constants: // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true); if (!UGEOp1) return SDValue(); Op1 = Op0; Op0 = UGEOp1; break; } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: std::swap(Op0, Op1); break; case ISD::SETULE: break; } SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1); return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, DAG.getConstant(0, dl, VT)); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); ISD::CondCode Cond = cast(CC)->get(); bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); if (isFP) { #ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif unsigned Opc; if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16); Opc = X86ISD::CMPM; } else { Opc = X86ISD::CMPP; // The SSE/AVX packed FP comparison nodes are defined with a // floating-point vector result that matches the operand type. This allows // them to work with an SSE1 target (integer vector types are not legal). VT = Op0.getSimpleValueType(); } // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), // emit two comparisons and a logic op to tie them together. SDValue Cmp; unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); if (SSECC >= 8 && !Subtarget.hasAVX()) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; if (Cond == ISD::SETUEQ) { CC0 = 3; // UNORD CC1 = 0; // EQ CombineOpc = X86ISD::FOR; } else { assert(Cond == ISD::SETONE); CC0 = 7; // ORD CC1 = 4; // NEQ CombineOpc = X86ISD::FAND; } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC1, dl, MVT::i8)); Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { // Handle all other FP comparisons here. Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the // result type of SETCC. The bitcast is expected to be optimized away // during combining/isel. if (Opc == X86ISD::CMPP) Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); return Cmp; } MVT VTOp0 = Op0.getSimpleValueType(); assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); // This is being called by type legalization because v2i32 is marked custom // for result type legalization for v2f32. if (VTOp0 == MVT::v2i32) return SDValue(); // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); // The result is boolean, but operands are int/float if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements in KNL. assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && "Unexpected operand type"); return LowerIntVSETCC_AVX512(Op, DAG); } // Lower using XOP integer comparisons. if (VT.is128BitVector() && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; switch (Cond) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; case ISD::SETULE: case ISD::SETLE: CmpMode = 0x01; break; case ISD::SETUGT: case ISD::SETGT: CmpMode = 0x02; break; case ISD::SETUGE: case ISD::SETGE: CmpMode = 0x03; break; case ISD::SETEQ: CmpMode = 0x04; break; case ISD::SETNE: CmpMode = 0x05; break; } // Are we comparing unsigned or signed integers? unsigned Opc = ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CmpMode, dl, MVT::i8)); } // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. // Revert part of the simplifySetCCWithAnd combine, to avoid an invert. if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) { SDValue BC0 = peekThroughBitcasts(Op0); if (BC0.getOpcode() == ISD::AND) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits, false, false)) { if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) { Cond = ISD::SETEQ; Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); } } } } // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2. if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND && Op0.getOperand(1) == Op1 && Op0.hasOneUse()) { ConstantSDNode *C1 = isConstOrConstSplat(Op1); if (C1 && C1->getAPIntValue().isPowerOf2()) { unsigned BitWidth = VT.getScalarSizeInBits(); unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1; SDValue Result = Op0.getOperand(0); Result = DAG.getNode(ISD::SHL, dl, VT, Result, DAG.getConstant(ShiftAmt, dl, VT)); Result = DAG.getNode(ISD::SRA, dl, VT, Result, DAG.getConstant(BitWidth - 1, dl, VT)); return Result; } } // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntVSETCC(Op, DAG); // If this is a SETNE against the signed minimum value, change it to SETGT. // If this is a SETNE against the signed maximum value, change it to SETLT. // which will be swapped to SETGT. // Otherwise we use PCMPEQ+invert. APInt ConstValue; if (Cond == ISD::SETNE && ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) { if (ConstValue.isMinSignedValue()) Cond = ISD::SETGT; else if (ConstValue.isMaxSignedValue()) Cond = ISD::SETLT; } // If both operands are known non-negative, then an unsigned compare is the // same as a signed compare and there's no need to flip signbits. // TODO: We could check for more general simplifications here since we're // computing known bits. bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); // Special case: Use min/max operations for unsigned compares. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ISD::isUnsignedIntSetCC(Cond) && (FlipSigns || ISD::isTrueWhenEqual(Cond)) && TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. if (Cond == ISD::SETUGT && ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { return !C->getAPIntValue().isMaxValue(); })) { // X > C --> X >= (C+1) --> X == umax(X, C+1) Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT)); Cond = ISD::SETUGE; } if (Cond == ISD::SETULT && ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { return !C->getAPIntValue().isNullValue(); })) { // X < C --> X <= (C-1) --> X == umin(X, C-1) Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT)); Cond = ISD::SETULE; } bool Invert = false; unsigned Opc; switch (Cond) { default: llvm_unreachable("Unexpected condition code"); case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH; case ISD::SETULE: Opc = ISD::UMIN; break; case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ISD::UMAX; break; } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } // Try to use SUBUS and PCMPEQ. if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG)) return V; // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ : X86ISD::PCMPGT; bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE || Cond == ISD::SETUGE; bool Invert = Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { assert(Subtarget.hasSSE2() && "Don't know how to lower!"); // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. SDValue SB; if (FlipSigns) { SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64); } else { SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64); } Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB); // Cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. static const int MaskHi[] = { 1, 1, 3, 3 }; static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Do the compare. SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } // Try to select this as a KORTEST+SETCC if possible. static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC) { // Only support equality comparisons. if (CC != ISD::SETEQ && CC != ISD::SETNE) return SDValue(); // Must be a bitcast from vXi1. if (Op0.getOpcode() != ISD::BITCAST) return SDValue(); Op0 = Op0.getOperand(0); MVT VT = Op0.getSimpleValueType(); if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) && !(Subtarget.hasDQI() && VT == MVT::v8i1) && !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))) return SDValue(); X86::CondCode X86Cond; if (isNullConstant(Op1)) { X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; } else if (isAllOnesConstant(Op1)) { // C flag is set for all ones. X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE; } else return SDValue(); // If the input is an OR, we can combine it's operands into the KORTEST. SDValue LHS = Op0; SDValue RHS = Op0; if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) { LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); } X86CC = DAG.getConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); } /// Emit flags for the given setcc condition and operands. Also returns the /// corresponding X86 condition code constant in X86CC. SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, SDValue &X86CC) const { // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC)) return BT; } // Try to use PTEST for a tree ORs equality compared with 0. // TODO: We could do AND tree with all 1s as well by using the C flag. if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC)) return PTEST; } // Try to lower using KORTEST. if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) return KORTEST; // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); X86CC = Op0.getOperand(0); if (Invert) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); X86CC = DAG.getConstant(CCode, dl, MVT::i8); } return Op0.getOperand(1); } } bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); if (CondCode == X86::COND_INVALID) return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); X86CC = DAG.getConstant(CondCode, dl, MVT::i8); return EFLAGS; } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDValue X86CC; SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); if (!EFLAGS) return SDValue(); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast(Cond)->get()); // Recreate the carry if needed. EVT CarryVT = Carry.getValueType(); APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getConstant(NegOne, DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); return getSETCC(CC, Cmp.getValue(1), DL, DAG); } // This function returns three things: the arithmetic computation itself // (Value), an EFLAGS result (Overflow), and a condition code (Cond). The // flag and the condition code define the case in which the arithmetic // computation overflows. static std::pair getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { assert(Op.getResNo() == 0 && "Unexpected result number!"); SDValue Value, Overflow; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned BaseOp = 0; SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; case ISD::UADDO: BaseOp = X86ISD::ADD; Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B; break; case ISD::SSUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; case ISD::USUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_B; break; case ISD::SMULO: BaseOp = X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: BaseOp = X86ISD::UMUL; Cond = X86::COND_O; break; } if (BaseOp) { // Also sets EFLAGS. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" // has only one use. SDLoc DL(Op); X86::CondCode Cond; SDValue Value, Overflow; std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG); SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG); assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!"); return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC); } /// Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; return false; } static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. if (Cond.getOpcode() == ISD::SETCC && ((Subtarget.hasSSE2() && VT == MVT::f64) || (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); unsigned SSECC = translateX86FSETCC( cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. // If either operand is a +0.0 constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. if (Subtarget.hasAVX() && !isNullFPConstant(Op1) && !isNullFPConstant(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } // AVX512 fallback is to lower selects of scalar floats to masked moves. if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } // For v64i1 without 64-bit support we need to split and rejoin. if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { assert(Subtarget.hasBWI() && "Expected BWI to be legal"); SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32); SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32); SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32); SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32); SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo); SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) Op1Scalar = Op1.getOperand(0); SDValue Op2Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, DAG.getIntPtrConstant(0, DL)); } } if (Cond.getOpcode() == ISD::SETCC) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the // select were also updated (for example, EmitTest has a RAUW). Refresh // the local references to the select operands in case they got stale. Op1 = Op.getOperand(1); Op2 = Op.getOperand(2); } } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode = cast(Cond.getOperand(0))->getZExtValue(); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Zero = DAG.getConstant(0, DL, Op.getValueType()); return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero); } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue Zero = DAG.getConstant(0, DL, Op.getValueType()); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { SDValue CmpOp0 = Cmp.getOperand(0); SDValue Src1, Src2; // true if Op2 is XOR or OR operator and one of its operands // is equal to Op1 // ( a , a op b) || ( b , a op b) auto isOrXorPattern = [&]() { if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { Src1 = Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); Src2 = Op1; return true; } return false; }; if (isOrXorPattern()) { SDValue Neg; unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); // we need mask of all zeros or ones with same size of the other // operands. if (CmpSz > VT.getSizeInBits()) Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); else if (CmpSz < VT.getSizeInBits()) Neg = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), DAG.getConstant(1, DL, VT)); else Neg = CmpOp0; SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Neg); // -(and (x, 0x1)) SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y } } } // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Cmp.getOpcode() == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { SDValue Value; X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); CC = DAG.getConstant(X86Cond, DL, MVT::i8); AddTest = false; } if (AddTest) { // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue BTCC; if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) { CC = BTCC; Cond = BT; AddTest = false; } } } if (AddTest) { CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()), X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } } // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Blacklist CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // Or finally, promote i8 cmovs if we have CMOV, // or i16 cmovs if it won't prevent folding a load. // FIXME: we should not limit promotion of i8 case to only when the CMOV is // legal, but EmitLoweredSelect() can not deal with these extensions // being inserted between two CMOV's. (in i16 case too TBN) // https://bugs.llvm.org/show_bug.cgi?id=40974 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) || (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); SDValue Ops[] = { Op2, Op1, CC, Cond }; SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDValue Ops[] = { Op2, Op1, CC, Cond }; return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops); } static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); MVT VTElt = VT.getVectorElementType(); SDLoc dl(Op); unsigned NumElts = VT.getVectorNumElements(); // Extend VT if the scalar type is i8/i16 and BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) { // If v16i32 is to be avoided, we'll need to split and concatenate. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG); ExtVT = MVT::getVectorVT(MVT::i32, NumElts); } // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, dl)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue V; MVT WideEltVT = WideVT.getVectorElementType(); if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) || (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { V = DAG.getNode(Op.getOpcode(), dl, WideVT, In); } else { SDValue NegOne = DAG.getConstant(-1, dl, WideVT); SDValue Zero = DAG.getConstant(0, dl, WideVT); V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); } // Truncate if we had to extend i16/i8 above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(VTElt, NumElts); V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, DAG.getIntPtrConstant(0, dl)); return V; } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); assert(Subtarget.hasAVX() && "Expected AVX support"); return LowerAVXExtend(Op, DAG, Subtarget); } // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. // For sign extend this needs to handle all vector sizes and SSE4.1 and // non-SSE4.1 targets. For zero extend this should only handle inputs of // MVT::v64i8 when BWI is not supported, but AVX512 is. static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); assert(SVT.getSizeInBits() > InSVT.getSizeInBits()); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && !(VT.is256BitVector() && Subtarget.hasAVX()) && !(VT.is512BitVector() && Subtarget.hasAVX512())) return SDValue(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); // For 256-bit vectors, we only need the lower (128-bit) half of the input. // For 512-bit vectors, we need 128-bits or 256-bits. if (InVT.getSizeInBits() > 128) { // Input needs to be at least the same number of elements as output, and // at least 128-bits. int InSize = InSVT.getSizeInBits() * NumElts; In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); InVT = In.getSimpleValueType(); } // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results, // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still // need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); if (InVT.getVectorNumElements() != NumElts) return DAG.getNode(Op.getOpcode(), dl, VT, In); // FIXME: Apparently we create inreg operations that could be regular // extends. unsigned ExtOpc = Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, dl, VT, In); } // pre-AVX2 256-bit extensions need to be split into 128-bit instructions. if (Subtarget.hasAVX()) { assert(VT.is256BitVector() && "256-bit vector expected"); int HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector HiMask(NumSrcElts, SM_SentinelUndef); for (int i = 0; i != HalfNumElts; ++i) HiMask[i] = HalfNumElts + i; SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In); SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask); Hi = DAG.getNode(Opc, dl, HalfVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } // We should only get here for sign extend. assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!"); assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs"); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; SDValue SignExt = Curr; // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. if (InVT != MVT::v4i32) { MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT; unsigned DestWidth = DestVT.getScalarSizeInBits(); unsigned Scale = DestWidth / InSVT.getSizeInBits(); unsigned InNumElts = InVT.getVectorNumElements(); unsigned DestElts = DestVT.getVectorNumElements(); // Build a shuffle mask that takes each input element and places it in the // MSBs of the new element size. SmallVector Mask(InNumElts, SM_SentinelUndef); for (unsigned i = 0; i != DestElts; ++i) Mask[i * Scale + (Scale - 1)] = i; Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask); Curr = DAG.getBitcast(DestVT, Curr); unsigned SignExtShift = DestWidth - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } if (VT == MVT::v2i64) { assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT"); SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT); SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5}); SignExt = DAG.getBitcast(VT, SignExt); } return SignExt; } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); assert(VT.getVectorNumElements() == VT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::i64) && "Unexpected element type"); assert((InVT.getVectorElementType() == MVT::i8 || InVT.getVectorElementType() == MVT::i16 || InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In); } if (Subtarget.hasInt256()) return Op; // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and // v4i32 to v4i64 // // Divide input vector into two parts // for v4i32 the high shuffle mask will be {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); unsigned NumElems = InVT.getVectorNumElements(); SmallVector ShufMask(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } /// Change a vector store into a pair of half-size vector stores. static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { SDValue StoredVal = Store->getValue(); assert((StoredVal.getValueType().is256BitVector() || StoredVal.getValueType().is512BitVector()) && "Expecting 256/512-bit op"); // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). if (Store->isVolatile()) return SDValue(); MVT StoreVT = StoredVal.getSimpleValueType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; unsigned HalfAlign = (128 == HalfSize ? 16 : 32); SDLoc DL(Store); SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize); SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize); SDValue Ptr0 = Store->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL); unsigned Alignment = Store->getAlignment(); SDValue Ch0 = DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), Alignment, Store->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, Store->getPointerInfo().getWithOffset(HalfAlign), MinAlign(Alignment, HalfAlign), Store->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); } /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar /// type. static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG) { SDValue StoredVal = Store->getValue(); assert(StoreVT.is128BitVector() && StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op"); StoredVal = DAG.getBitcast(StoreVT, StoredVal); // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). if (Store->isVolatile()) return SDValue(); MVT StoreSVT = StoreVT.getScalarType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned ScalarSize = StoreSVT.getStoreSize(); unsigned Alignment = Store->getAlignment(); SDLoc DL(Store); SmallVector Stores; for (unsigned i = 0; i != NumElems; ++i) { unsigned Offset = i * ScalarSize; SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL); SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, DAG.getIntPtrConstant(i, DL)); SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, Store->getPointerInfo().getWithOffset(Offset), MinAlign(Alignment, Offset), Store->getMemOperand()->getFlags()); Stores.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast(Op.getNode()); SDLoc dl(St); SDValue StoredVal = St->getValue(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores. if (StoredVal.getValueType().isVector() && StoredVal.getValueType().getVectorElementType() == MVT::i1) { assert(StoredVal.getValueType().getVectorNumElements() <= 8 && "Unexpected VT"); assert(!St->isTruncatingStore() && "Expected non-truncating store"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), StoredVal, DAG.getIntPtrConstant(0, dl)); StoredVal = DAG.getBitcast(MVT::i16, StoredVal); StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } if (St->isTruncatingStore()) return SDValue(); // If this is a 256-bit store of concatenated ops, we are better off splitting // that store into two 128-bit stores. This avoids spurious use of 256-bit ops // and each half can execute independently. Some cores would split the op into // halves anyway, so the concat (vinsertf128) is purely an extra op. MVT StoreVT = StoredVal.getSimpleValueType(); if (StoreVT.is256BitVector()) { SmallVector CatOps; if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) return splitVectorStore(St, DAG); return SDValue(); } assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != TargetLowering::TypeWidenVector) return SDValue(); MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), StoreVT.getVectorNumElements() * 2); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); if (Subtarget.hasSSE2()) { // Widen the vector, cast to a v2x64 type, extract the single 64-bit element // and store it. MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; MVT CastVT = MVT::getVectorVT(StVT, 2); StoredVal = DAG.getBitcast(CastVT, StoredVal); StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } assert(Subtarget.hasSSE1() && "Expected SSE"); SDVTList Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64, St->getMemOperand()); } // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector loads."); assert(RegVT.isInteger() && "We only custom lower integer vector loads."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. if (RegVT.getVectorElementType() == MVT::i1) { assert(EVT(RegVT) == MemVT && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); // Replace chain users with the new chain. assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!"); SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd); Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT, DAG.getBitcast(MVT::v16i1, Val), DAG.getIntPtrConstant(0, dl)); return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } // Nothing useful we can do without SSE2 shuffles. assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); ISD::LoadExtType Ext = Ld->getExtensionType(); assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) && "Only anyext and sext are currently implemented."); assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { // The only way in which we have a legal 256-bit vector result but not the // integer 256-bit operations needed to directly lower a sextload is if we // have AVX1 but not AVX2. In that case, we can always emit a sextload to // a 128-bit vector and a normal sign_extend to 256-bits that should get // correctly legalized. We do this late to allow the canonical form of // sextload to persist throughout the rest of the DAG combiner -- it wants // to fold together any extensions it can, and so will fuse a sign_extend // of an sextload into a sextload targeting a wider value. SDValue Load; if (MemSz == 128) { // Just switch this to a normal load. assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " "it must be a legal 128-bit vector " "type!"); Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); } else { assert(MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"); // Do an sext load to a 128-bit vector type. We want to use the same // number of elements, but elements half as wide. This will end up being // recursively lowered by this routine, but will succeed as we definitely // have all the necessary features if we're using AVX1. EVT HalfEltVT = EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); Load = DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT, Ld->getAlignment(), Ld->getMemOperand()->getFlags()); } // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); // Finally, do a normal sign-extend to the desired register. SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT); return DAG.getMergeValues({SExt, Load.getValue(1)}, dl); } // All sizes must be a power of two. assert(isPowerOf2_32(RegSz * MemSz * NumElems) && "Non-power-of-two elements are not custom lowered!"); // Attempt to load the original value using scalar loads. // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; } } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && (64 <= MemSz)) SclrLoadTy = MVT::f64; // Calculate the number of scalar loads that we need to perform // in order to load our vector from memory. unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && "Can only lower sext loads with a single scalar load!"); unsigned loadRegSize = RegSz; if (Ext == ISD::SEXTLOAD && RegSz >= 256) loadRegSize = 128; // If we don't have BWI we won't be able to create the shuffle needed for // v8i8->v8i64. if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) loadRegSize = 128; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT( *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits()); // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegSize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); // We can't shuffle using an illegal type. assert(TLI.isTypeLegal(WideVecVT) && "We only lower types that form legal widened vector types"); SmallVector Chains; SDValue Ptr = Ld->getBasePtr(); unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8; SDValue Increment = DAG.getConstant(OffsetInc, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Res = DAG.getUNDEF(LoadUnitVecVT); unsigned Offset = 0; for (unsigned i = 0; i < NumLoads; ++i) { unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset); // Perform a single load. SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo().getWithOffset(Offset), NewAlign, Ld->getMemOperand()->getFlags()); Chains.push_back(ScalarLoad.getValue(1)); // Create the first element type using SCALAR_TO_VECTOR in order to avoid // another round of DAGCombining. if (i == 0) Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); else Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, ScalarLoad, DAG.getIntPtrConstant(i, dl)); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); Offset += OffsetInc; } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } // Redistribute the loaded elements into the different locations. SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i * SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), ShuffleVec); // Bitcast to the requested type. Shuff = DAG.getBitcast(RegVT, Shuff); return DAG.getMergeValues({Shuff, TF}, dl); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes /// each of which has no other use apart from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) return false; return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse() && Op.getOperand(1).getOpcode() == X86ISD::SETCC && Op.getOperand(1).hasOneUse()); } /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the /// SETCC node has a single use. static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse(); return false; } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); SDValue CC; bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast(Cond.getOperand(2))->get() == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || Cond.getOperand(0).getOpcode() == ISD::SSUBO || Cond.getOperand(0).getOpcode() == ISD::USUBO || Cond.getOperand(0).getOpcode() == ISD::SMULO || Cond.getOperand(0).getOpcode() == ISD::UMULO)) { Inverted = true; Cond = Cond.getOperand(0); } else { if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; } } #if 0 // FIXME: LowerXALUO doesn't handle these!! else if (Cond.getOpcode() == X86ISD::ADD || Cond.getOpcode() == X86ISD::SUB || Cond.getOpcode() == X86ISD::SMUL || Cond.getOpcode() == X86ISD::UMUL) Cond = LowerXALUO(Cond, DAG); #endif // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { Cond = Cmp; addTest = false; } else { switch (cast(CC)->getZExtValue()) { default: break; case X86::COND_O: case X86::COND_B: // These can only come from an arithmetic instruction with overflow, // e.g. SADDO, UADDO. Cond = Cond.getOperand(1); addTest = false; break; } } } CondOpcode = Cond.getOpcode(); if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { SDValue Value; X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); if (Inverted) X86Cond = X86::GetOppositeBranchCondition(X86Cond); CC = DAG.getConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { SDValue Cmp = Cond.getOperand(0).getOperand(1); if (CondOpc == ISD::OR) { // Also, recognize the pattern generated by an FCMP_UNE. We can emit // two branches instead of an explicit OR instruction with a // separate test. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp)) { CC = Cond.getOperand(0).getOperand(0); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = Cond.getOperand(1).getOperand(0); Cond = Cmp; addTest = false; } } else { // ISD::AND // Also, recognize the pattern generated by an FCMP_OEQ. We can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); X86::CondCode CCode = (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cmp; addTest = false; } } } } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. // It should be transformed during dag combiner except when the condition // is set by a arithmetics with overflow node. X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit OR instruction with a // separate test. SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } if (addTest) { // Look pass the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue BTCC; if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) { CC = BTCC; Cond = BT; addTest = false; } } } if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getConstant(X86Cond, dl, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()), X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || SplitStack || EmitStackProbe; SDLoc dl(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); unsigned Align = cast(Op.getOperand(2))->getZExtValue(); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. const Function &F = MF.getFunction(); for (const auto &A : F.args()) { if (A.hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); } } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); MF.getInfo()->setHasWinAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Align) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } Result = SP; } Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); SDValue Ops[2] = {Result, Chain}; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); if (!Subtarget.is64Bit() || Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } // __va_list_tag: // gp_offset (0 - 6 * 8) // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). // reg_save_area SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV)); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getMemBasePlusOffset(FIN, 4, DL); Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV, 4)); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore( Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12)); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) // The Win64 ABI uses char* instead of a structure. return DAG.expandVAArg(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. if (ArgVT == MVT::f80) { llvm_unreachable("va_arg for f80 not yet implemented"); } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { llvm_unreachable("Unhandled argument type in LowerVAARG"); } if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget.useSoftFloat() && !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), DAG.getConstant(ArgMode, dl, MVT::i8), DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); } static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, // where a va_list is still an i8*. assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget.isCallingConvWin64( DAG.getMachineFunction().getFunction().getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } // Helper to get immediate/variable SSE shift opcode from other shift opcodes. static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) { switch (Opc) { case ISD::SHL: case X86ISD::VSHL: case X86ISD::VSHLI: return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI; case ISD::SRL: case X86ISD::VSRL: case X86ISD::VSRLI: return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI; case ISD::SRA: case X86ISD::VSRA: case X86ISD::VSRAI: return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI; } llvm_unreachable("Unknown target vector shift node"); } /// Handle vector element shifts where the shift amount is a constant. /// Takes immediate version of shift as input. static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); // Bitcast the source vector to the output type, this is mainly necessary for // vXi8/vXi64 shifts. if (VT != SrcOp.getSimpleValueType()) SrcOp = DAG.getBitcast(VT, SrcOp); // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) ShiftAmt = ElementType.getSizeInBits() - 1; else return DAG.getConstant(0, dl, VT); } assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs. if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector Elts; unsigned NumElts = SrcOp->getNumOperands(); switch (Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } auto *ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } auto *ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } auto *ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } break; } return DAG.getBuildVector(VT, dl, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, dl, MVT::i8)); } /// Handle vector element shifts where the shift amount may or may not be a /// constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. if (ConstantSDNode *CShAmt = dyn_cast(ShAmt)) return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, CShAmt->getZExtValue(), DAG); // Change opcode to non-immediate version. Opc = getTargetVShiftUniformOpcode(Opc, true); // Need to build a vector containing shift amount. // SSE/AVX packed shifts only use the lower 64-bit of the shift count. // +====================+============+=======================================+ // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | // +====================+============+=======================================+ // | i64 | Yes, No | Use ShAmt as lowest elt | // | i32 | Yes | zero-extend in-reg | // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg | // | (i32 zext(i16/i8)) | No | byte-shift-in-reg | // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | // +====================+============+=======================================+ if (SVT == MVT::i64) ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 || ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) { ShAmt = ShAmt.getOperand(0); MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16; ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt); if (Subtarget.hasSSE41()) ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); else { SDValue ByteShift = DAG.getConstant( (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt); ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, ByteShift); ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, ByteShift); } } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else { SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } /// Return Mask with the necessary casting or extending /// for \p Mask according to \p MaskVT when lowering masking intrinsics static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { if (isAllOnesConstant(Mask)) return DAG.getConstant(1, dl, MaskVT); if (X86::isZeroNode(Mask)) return DAG.getConstant(0, dl, MaskVT); assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!"); if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) { assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!"); assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); // In case 32bit mode, bitcast i64 is illegal, extend/split it. SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(0, dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(1, dl, MVT::i32)); Lo = DAG.getBitcast(MVT::v32i1, Lo); Hi = DAG.getBitcast(MVT::v32i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); } } /// Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); unsigned OpcodeSelect = ISD::VSELECT; SDLoc dl(Op); if (isAllOnesConstant(Mask)) return Op; SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). /// The mask is coming as MVT::i8 and it should be transformed /// to MVT::v1i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (auto *MaskConst = dyn_cast(Mask)) if (MaskConst->getZExtValue() & 0x1) return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(Mask.getValueType() == MVT::i8 && "Unexpect type"); SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1, DAG.getBitcast(MVT::v8i1, Mask), DAG.getIntPtrConstant(0, dl)); if (Op.getOpcode() == X86ISD::FSETCCM || Op.getOpcode() == X86ISD::FSETCCM_SAE || Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { if (!Fn->hasPersonalityFn()) report_fatal_error( "querying registration node size for function without personality"); // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See // WinEHStatePass for the full struct definition. switch (classifyEHPersonality(Fn->getPersonalityFn())) { case EHPersonality::MSVC_X86SEH: return 24; case EHPersonality::MSVC_CXX: return 16; default: break; } report_fatal_error( "can only recover FP for 32-bit MSVC EH personality functions"); } /// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize /// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP) { MachineFunction &MF = DAG.getMachineFunction(); SDLoc dl; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); // It's possible that the parent function no longer has a personality function // if the exceptional code was optimized away, in which case we just return // the incoming EBP. if (!Fn->hasPersonalityFn()) return EntryEBP; // Get an MCSymbol that will ultimately resolve to the frame offset of the EH // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after // prologue to RBP in the parent function. const X86Subtarget &Subtarget = static_cast(DAG.getSubtarget()); if (Subtarget.is64Bit()) return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { if (auto *C = dyn_cast(Rnd)) return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; return false; }; auto isRoundModeSAE = [](SDValue Rnd) { if (auto *C = dyn_cast(Rnd)) return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC; return false; }; auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) { if (auto *C = dyn_cast(Rnd)) { RC = C->getZExtValue(); if (RC & X86::STATIC_ROUNDING::NO_EXC) { // Clear the NO_EXC bit and check remaining bits. RC ^= X86::STATIC_ROUNDING::NO_EXC; return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT || RC == X86::STATIC_ROUNDING::TO_NEG_INF || RC == X86::STATIC_ROUNDING::TO_POS_INF || RC == X86::STATIC_ROUNDING::TO_ZERO; } } return false; }; SDLoc dl(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: { // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(2); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Op.getOperand(1), DAG.getTargetConstant(RC, dl, MVT::i32)); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); } case INTR_TYPE_1OP_SAE: { SDValue Sae = Op.getOperand(2); unsigned Opc; if (isRoundModeCurDirection(Sae)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else return SDValue(); return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1)); } case INTR_TYPE_2OP: { SDValue Src2 = Op.getOperand(2); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(3); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Op.getOperand(1), Src2, DAG.getTargetConstant(RC, dl, MVT::i32)); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Src2); } case INTR_TYPE_2OP_SAE: { SDValue Sae = Op.getOperand(3); unsigned Opc; if (isRoundModeCurDirection(Sae)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else return SDValue(); return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case INTR_TYPE_3OP: case INTR_TYPE_3OP_IMM8: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); if (IntrData->Type == INTR_TYPE_3OP_IMM8) Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, DAG.getTargetConstant(RC, dl, MVT::i32)); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3); } case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RC Opcode is specified and // - RC is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return getVectorMaskingNode( DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, DAG.getTargetConstant(RC, dl, MVT::i32)), Mask, PassThru, Subtarget, DAG); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK_SAE: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Rnd = Op.getOperand(4); unsigned Opc; if (isRoundModeCurDirection(Rnd)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Rnd)) Opc = IntrData->Opc1; else return SDValue(); return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. bool HasRounding = IntrWithRoundingModeOpcode != 0; if (Op.getNumOperands() == (5U + HasRounding)) { if (HasRounding) { SDValue Rnd = Op.getOperand(5); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return getScalarMaskingNode( DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, DAG.getTargetConstant(RC, dl, MVT::i32)), Mask, passThru, Subtarget, DAG); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } assert(Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); unsigned Opc = IntrData->Opc0; if (HasRounding) { SDValue Sae = Op.getOperand(6); if (isRoundModeSAE(Sae)) Opc = IntrWithRoundingModeOpcode; else if (!isRoundModeCurDirection(Sae)) return SDValue(); } return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RND: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue Rnd = Op.getOperand(5); SDValue NewOp; unsigned RC = 0; if (isRoundModeCurDirection(Rnd)) NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); else if (isRoundModeSAEToX(Rnd, RC)) NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, DAG.getTargetConstant(RC, dl, MVT::i32)); else return SDValue(); return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue Sae = Op.getOperand(5); unsigned Opc; if (isRoundModeCurDirection(Sae)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else return SDValue(); return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue NewOp; if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, DAG.getTargetConstant(RC, dl, MVT::i32)); else if (!isRoundModeCurDirection(Rnd)) return SDValue(); } if (!NewOp) NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned Opc = IntrData->Opc0; if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(5); if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else if (!isRoundModeCurDirection(Sae)) return SDValue(); } return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_SCALAR_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Sae = Op.getOperand(6); unsigned Opc; if (isRoundModeCurDirection(Sae)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else return SDValue(); return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); unsigned Opc = IntrData->Opc0; if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(6); if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else if (!isRoundModeCurDirection(Sae)) return SDValue(); } return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case BLENDV: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger(); Src3 = DAG.getBitcast(MaskVT, Src3); // Reverse the operands to match VSELECT order. return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1); } case VPERM_2OP : { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); // Swap Src1 and Src2 in the node creation return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1); } case IFMA_OP: // NOTE: We need to swizzle the operands to pass the multiply operands // first. return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, DAG.getConstant(0, dl, MVT::v8i1), FPclassMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(MVT::i8, Ins); } case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); SDValue CC = Op.getOperand(3); CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(4); if (isRoundModeSAE(Sae)) return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC, Sae); if (!isRoundModeCurDirection(Sae)) return SDValue(); } //default rounding mode return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); SDValue Mask = Op.getOperand(4); SDValue Cmp; if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(5); if (isRoundModeSAE(Sae)) Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae); else if (!isRoundModeCurDirection(Sae)) return SDValue(); } //default rounding mode if (!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, DAG.getConstant(0, dl, MVT::v8i1), CmpMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(MVT::i8, Ins); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); SDValue SetCC; switch (CC) { case ISD::SETEQ: { // (ZF = 0 and PF = 0) SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); break; } case ISD::SETNE: { // (ZF = 1 or PF = 1) SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); break; } case ISD::SETGT: // (CF = 0 and ZF = 0) SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG); break; } case ISD::SETGE: // CF = 0 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); break; case ISD::SETLE: // The condition is opposite to GE. Swap the operands. SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG); break; default: llvm_unreachable("Unexpected illegal condition!"); } return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned CondVal = cast(Op.getOperand(3))->getZExtValue(); SDValue Sae = Op.getOperand(4); SDValue FCmp; if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8)); else if (isRoundModeSAE(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); else return SDValue(); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getConstant(0, dl, MVT::v16i1), FCmp, DAG.getIntPtrConstant(0, dl)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, DAG.getBitcast(MVT::i16, Ins)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), Subtarget, DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is return Op.getOperand(1); // Avoid false dependency. if (PassThru.isUndef()) PassThru = DAG.getConstant(0, dl, VT); return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru, Mask); } case FIXUPIMM: case FIXUPIMM_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Imm = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Passthru = (IntrData->Type == FIXUPIMM) ? Src1 : getZeroVector(VT, Subtarget, DAG, dl); unsigned Opc = IntrData->Opc0; if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(6); if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else if (!isRoundModeCurDirection(Sae)) return SDValue(); } SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm); if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE) return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); } case ROUNDP: { assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, Op.getOperand(2), DAG.getConstant(0xf, dl, MVT::i32)); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), RoundingMode); } case ROUNDS: { assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, Op.getOperand(3), DAG.getConstant(0xf, dl, MVT::i32)); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), RoundingMode); } // ADC/ADCX/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32); SDValue Res; // If the carry in is zero, then we should just use ADD/SUB instead of // ADC/SBB. if (isNullConstant(Op.getOperand(1))) { Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2), Op.getOperand(3)); } else { SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1), DAG.getConstant(-1, dl, MVT::i8)); Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2), Op.getOperand(3), GenCF.getValue(1)); } SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); SDValue Results[] = { SetCC, Res }; return DAG.getMergeValues(Results, dl); } case CVTPD2PS_MASK: case CVTPD2DQ_MASK: case CVTQQ2PS_MASK: case TRUNCATE_TO_REG: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); if (isAllOnesConstant(Mask)) return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, Mask); } case CVTPS2PH_MASK: { SDValue Src = Op.getOperand(1); SDValue Rnd = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); if (isAllOnesConstant(Mask)) return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd); MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd, PassThru, Mask); } case CVTNEPS2BF16_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); if (ISD::isBuildVectorAllOnes(Mask.getNode())) return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); // Break false dependency. if (PassThru.isUndef()) PassThru = DAG.getConstant(0, dl, PassThru.getValueType()); return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, Mask); } default: break; } } switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. case Intrinsic::x86_avx512_ktestc_b: case Intrinsic::x86_avx512_ktestc_w: case Intrinsic::x86_avx512_ktestc_d: case Intrinsic::x86_avx512_ktestc_q: case Intrinsic::x86_avx512_ktestz_b: case Intrinsic::x86_avx512_ktestz_w: case Intrinsic::x86_avx512_ktestz_d: case Intrinsic::x86_avx512_ktestz_q: case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestz_256: case Intrinsic::x86_avx_ptestc_256: case Intrinsic::x86_avx_ptestnzc_256: case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { unsigned TestOpc = X86ISD::PTEST; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx512_ktestc_b: case Intrinsic::x86_avx512_ktestc_w: case Intrinsic::x86_avx512_ktestc_d: case Intrinsic::x86_avx512_ktestc_q: // CF = 1 TestOpc = X86ISD::KTEST; X86CC = X86::COND_B; break; case Intrinsic::x86_avx512_ktestz_b: case Intrinsic::x86_avx512_ktestz_w: case Intrinsic::x86_avx512_ktestz_d: case Intrinsic::x86_avx512_ktestz_q: TestOpc = X86ISD::KTEST; X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; } SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: case Intrinsic::x86_sse42_pcmpestric128: case Intrinsic::x86_sse42_pcmpistrio128: case Intrinsic::x86_sse42_pcmpestrio128: case Intrinsic::x86_sse42_pcmpistris128: case Intrinsic::x86_sse42_pcmpestris128: case Intrinsic::x86_sse42_pcmpistriz128: case Intrinsic::x86_sse42_pcmpestriz128: { unsigned Opcode; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse42_pcmpistria128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_E; break; } SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2); SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistri128: case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) Opcode = X86ISD::PCMPISTR; else Opcode = X86ISD::PCMPESTR; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_sse42_pcmpistrm128: case Intrinsic::x86_sse42_pcmpestrm128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistrm128) Opcode = X86ISD::PCMPISTR; else Opcode = X86ISD::PCMPESTR; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); auto &Context = MF.getMMI().getContext(); MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + Twine(MF.getFunctionNumber())); return DAG.getNode(getGlobalWrapperKind(), dl, VT, DAG.getMCSymbol(S, PtrVT)); } case Intrinsic::x86_seh_lsda: { // Compute the symbol for the LSDA. We know it'll get emitted later. MachineFunction &MF = DAG.getMachineFunction(); SDValue Op1 = Op.getOperand(1); auto *Fn = cast(cast(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. SDValue Result = DAG.getMCSymbol(LSDASym, VT); return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); } case Intrinsic::eh_recoverfp: { SDValue FnOp = Op.getOperand(1); SDValue IncomingFPOp = Op.getOperand(2); GlobalAddressSDNode *GSD = dyn_cast(FnOp); auto *Fn = dyn_cast_or_null(GSD ? GSD->getGlobal() : nullptr); if (!Fn) report_fatal_error( "llvm.eh.recoverfp must take a function as the first argument"); return recoverFramePointer(DAG, Fn, IncomingFPOp); } case Intrinsic::localaddress: { // Returns one of the stack, base, or frame pointer registers, depending on // which is used to reference local variables. MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); else { // Handles the SP or FP case. bool CantUseFP = RegInfo->needsStackRealignment(MF); if (CantUseFP) Reg = RegInfo->getPtrSizedStackRegister(MF); else Reg = RegInfo->getPtrSizedFrameRegister(MF); } return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } case Intrinsic::x86_avx512_vp2intersect_q_512: case Intrinsic::x86_avx512_vp2intersect_q_256: case Intrinsic::x86_avx512_vp2intersect_q_128: case Intrinsic::x86_avx512_vp2intersect_d_512: case Intrinsic::x86_avx512_vp2intersect_d_256: case Intrinsic::x86_avx512_vp2intersect_d_128: { MVT MaskVT = Op.getSimpleValueType(); SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); SDLoc DL(Op); SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs, Op->getOperand(1), Op->getOperand(2)); SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation); SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation); return DAG.getMergeValues({Result0, Result1}, DL); } } } static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); MemIntrinsicSDNode *MemIntr = cast(Op); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; SDValue Res = DAG.getTargetMemSDNode( VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), VT.getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); // We support two versions of the gather intrinsics. One with scalar mask and // one with vXi1 mask. Convert scalar to vXi1 if necessary. if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); MemIntrinsicSDNode *MemIntr = cast(Op); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; SDValue Res = DAG.getTargetMemSDNode( VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); // We support two versions of the scatter intrinsics. One with scalar mask and // one with vXi1 mask. Convert scalar to vXi1 if necessary. if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); MemIntrinsicSDNode *MemIntr = cast(Op); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; SDValue Res = DAG.getTargetMemSDNode( VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); return Res.getValue(1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } /// Handles the lowering of builtin intrinsics with chain that return their /// value into registers EDX:EAX. /// If operand ScrReg is a valid register identifier, then operand 2 of N is /// copied to SrcReg. The assumption is that SrcReg is an implicit input to /// TargetOpcode. /// Returns a Glue value which can be used to add extra copy-from-reg if the /// expanded intrinsics implicitly defines extra registers (i.e. not just /// EDX:EAX). static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { SDValue Chain = N->getOperand(0); SDValue Glue; if (SrcReg) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue); Glue = Chain.getValue(1); } SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue N1Ops[] = {Chain, Glue}; SDNode *N1 = DAG.getMachineNode( TargetOpcode, DL, Tys, ArrayRef(N1Ops, Glue.getNode() ? 2 : 1)); Chain = SDValue(N1, 0); // Reads the content of XCR and returns it in registers EDX:EAX. SDValue LO, HI; if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); Glue = HI.getValue(2); if (Subtarget.is64Bit()) { // Merge the two 32-bit values into a 64-bit one. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return Glue; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); return Glue; } /// Handles the lowering of builtin intrinsics that read the time stamp counter /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower /// READCYCLECOUNTER nodes. static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode, /* NoRegister */0, Subtarget, Results); if (Opcode != X86::RDTSCP) return; SDValue Chain = Results[1]; // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into // the ECX register. Add 'ecx' explicitly to the chain. SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue); Results[1] = ecx; Results.push_back(ecx.getValue(1)); } static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector Results; SDLoc DL(Op); getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget, Results); return DAG.getMergeValues(Results, DL); } static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue RegNode = Op.getOperand(2); WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); if (!EHInfo) report_fatal_error("EH registrations only live in functions using WinEH"); // Cast the operand to an alloca, and remember the frame index. auto *FINode = dyn_cast(RegNode); if (!FINode) report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); // Return the chain operand without making any DAG nodes. return Chain; } static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue EHGuard = Op.getOperand(2); WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); if (!EHInfo) report_fatal_error("EHGuard only live in functions using WinEH"); // Cast the operand to an alloca, and remember the frame index. auto *FINode = dyn_cast(EHGuard); if (!FINode) report_fatal_error("llvm.x86.seh.ehguard expects a static alloca"); EHInfo->EHGuardFrameIndex = FINode->getIndex(); // Return the chain operand without making any DAG nodes. return Chain; } /// Emit Truncating Store with signed or unsigned saturation. static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; return SignedSat ? DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO) : DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO); } /// Emit Masked Truncating Store with signed or unsigned saturation. static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Val, Ptr, Mask }; return SignedSat ? DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO) : DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { switch (IntNo) { case llvm::Intrinsic::x86_seh_ehregnode: return MarkEHRegistrationNode(Op, DAG); case llvm::Intrinsic::x86_seh_ehguard: return MarkEHGuard(Op, DAG); case llvm::Intrinsic::x86_rdpkru: { SDLoc dl(Op); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); // Create a RDPKRU node and pass 0 to the ECX parameter. return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0), DAG.getConstant(0, dl, MVT::i32)); } case llvm::Intrinsic::x86_wrpkru: { SDLoc dl(Op); // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0 // to the EDX and ECX parameters. return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, Op.getOperand(0), Op.getOperand(2), DAG.getConstant(0, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); } case llvm::Intrinsic::x86_flags_read_u32: case llvm::Intrinsic::x86_flags_read_u64: case llvm::Intrinsic::x86_flags_write_u32: case llvm::Intrinsic::x86_flags_write_u64: { // We need a frame pointer because this will get lowered to a PUSH/POP // sequence. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later // during FinalizeISel in EmitInstrWithCustomInserter. return SDValue(); } case Intrinsic::x86_lwpins32: case Intrinsic::x86_lwpins64: case Intrinsic::x86_umwait: case Intrinsic::x86_tpause: { SDLoc dl(Op); SDValue Chain = Op->getOperand(0); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); case Intrinsic::x86_umwait: Opcode = X86ISD::UMWAIT; break; case Intrinsic::x86_tpause: Opcode = X86ISD::TPAUSE; break; case Intrinsic::x86_lwpins32: case Intrinsic::x86_lwpins64: Opcode = X86ISD::LWPINS; break; } SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2), Op->getOperand(3), Op->getOperand(4)); SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } case Intrinsic::x86_enqcmd: case Intrinsic::x86_enqcmds: { SDLoc dl(Op); SDValue Chain = Op.getOperand(0); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic!"); case Intrinsic::x86_enqcmd: Opcode = X86ISD::ENQCMD; break; case Intrinsic::x86_enqcmds: Opcode = X86ISD::ENQCMDS; break; } SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2), Op.getOperand(3)); SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } } return SDValue(); } SDLoc dl(Op); switch(IntrData->Type) { default: llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other); SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), DAG.getConstant(1, dl, Op->getValueType(1)), DAG.getConstant(X86::COND_B, dl, MVT::i8), SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } case GATHER_AVX2: { SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case GATHER: { //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { //scatter(base, mask, index, v1, scale); SDValue Chain = Op.getOperand(0); SDValue Base = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal = cast(Hint)->getZExtValue(); assert((HintVal == 2 || HintVal == 3) && "Wrong prefetch hint in intrinsic: should be 2 or 3"); unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, Subtarget); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector Results; getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. case RDPMC: // GetExtended Control Register. case XGETBV: { SmallVector Results; // RDPMC uses ECX to select the index of the performance counter to read. // XGETBV uses ECX to select the index of the XCR register to return. // The result is stored into registers EDX:EAX. expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG); SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); } case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { SDValue Mask = Op.getOperand(4); SDValue DataToTruncate = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MemIntrinsicSDNode *MemIntr = dyn_cast(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); EVT MemVT = MemIntr->getMemoryVT(); uint16_t TruncationOp = IntrData->Opc0; switch (TruncationOp) { case X86ISD::VTRUNC: { if (isAllOnesConstant(Mask)) // return just a truncate store return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT, MemIntr->getMemOperand()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT, MemIntr->getMemOperand(), true /* truncating */); } case X86ISD::VTRUNCUS: case X86ISD::VTRUNCS: { bool IsSigned = (TruncationOp == X86ISD::VTRUNCS); if (isAllOnesConstant(Mask)) return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT, MemIntr->getMemOperand(), DAG); MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, VMask, MemVT, MemIntr->getMemOperand(), DAG); } default: llvm_unreachable("Unsupported truncstore intrinsic"); } } } } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo()); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo()); } SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const { DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true); return getReturnAddressFrameIndex(DAG); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); EVT VT = Op.getValueType(); MFI.setFrameAddressIsTaken(true); if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { // Depth > 0 makes no sense on targets which use Windows unwind codes. It // is not possible to crawl up the stack without looking at the unwind codes // simultaneously. int FrameAddrIndex = FuncInfo->getFAIndex(); if (!FrameAddrIndex) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false); FuncInfo->setFAIndex(FrameAddrIndex); } return DAG.getFrameIndex(FrameAddrIndex, VT); } unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const MachineFunction &MF = DAG.getMachineFunction(); unsigned Reg = StringSwitch(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) .Case("ebp", X86::EBP) .Case("rbp", X86::RBP) .Default(0); if (Reg == X86::EBP || Reg == X86::RBP) { if (!TFI.hasFP(MF)) report_fatal_error("register " + StringRef(RegName) + " is allocatable: function has no frame pointer"); #ifndef NDEBUG else { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"); } #endif } if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } unsigned X86TargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX; } unsigned X86TargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Funclet personalities don't use selectors (the runtime does the selection). assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; } bool X86TargetLowering::needsFixedCatchObjects() const { return Subtarget.isTargetWin64(); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc dl (Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl)); StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo()); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, DAG.getRegister(StoreAddrReg, PtrVT)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); // If the subtarget is not 64bit, we may need the global base reg // after isel expand pseudo, i.e., after CGBR pass ran. // Therefore, ask for the GlobalBaseReg now, so that the pass // inserts the code for us in case we need it. // Otherwise, we will end up in a situation where we will // reference a virtual register that is not defined! if (!Subtarget.is64Bit()) { const X86InstrInfo *TII = Subtarget.getInstrInfo(); (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); } return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, Op.getOperand(0)); } static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl (Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (Subtarget.is64Bit()) { SDValue OutChains[6]; // Large code-model. const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix // Load the pointer to the nested function into R11. unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, dl, MVT::i64)); OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), /* Alignment = */ 2); // Load the 'nest' parameter value into R10. // R10 is specified in X86CallingConv.td OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, dl, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 10)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, dl, MVT::i64)); OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), /* Alignment = */ 2); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, dl, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 20)); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, dl, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 22)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); CallingConv::ID CC = Func->getCallingConv(); unsigned NestReg; switch (CC) { default: llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::X86_StdCall: { // Pass 'nest' parameter in ECX. // Must be kept in sync with X86CallingConv.td NestReg = X86::ECX; // Check that ECX wasn't needed by an 'inreg' parameter. FunctionType *FTy = Func->getFunctionType(); const AttributeList &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; unsigned Idx = 1; for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) if (Attrs.hasAttribute(Idx, Attribute::InReg)) { auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" " parameters!"); } } break; } case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; break; } SDValue OutChains[4]; SDValue Addr, Disp; Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(10, dl, MVT::i32)); Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, dl, MVT::i32)); OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), /* Alignment = */ 1); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, dl, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 5), /* Alignment = */ 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, dl, MVT::i32)); OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), /* Alignment = */ 1); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { /* The rounding mode is in bits 11:10 of FPSR, and has the following settings: 00 Round to nearest 01 Round to -inf 10 Round to +inf 11 Round to 0 FLT_ROUNDS, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we do: (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) */ MachineFunction &MF = DAG.getMachineFunction(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo()); // Transform as necessary SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x800, DL, MVT::i16)), DAG.getConstant(11, DL, MVT::i8)); SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x400, DL, MVT::i16)), DAG.getConstant(9, DL, MVT::i8)); SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, DAG.getNode(ISD::ADD, DL, MVT::i16, DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), DAG.getConstant(1, DL, MVT::i16)), DAG.getConstant(3, DL, MVT::i16)); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } // Split an unary integer op into 2 half sized ops. static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumElems = VT.getVectorNumElements(); unsigned SizeInBits = VT.getSizeInBits(); MVT EltVT = VT.getVectorElementType(); SDValue Src = Op.getOperand(0); assert(EltVT == Src.getSimpleValueType().getVectorElementType() && "Src and Op should have the same element type!"); // Extract the Lo/Hi vectors SDLoc dl(Op); SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2); SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2); MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, Lo), DAG.getNode(Op.getOpcode(), dl, NewVT, Hi)); } // Decompose 256-bit ops into smaller 128-bit ops. static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return LowerVectorIntUnary(Op, DAG); } // Decompose 512-bit ops into smaller 256-bit ops. static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 512-bit vector integer operation"); return LowerVectorIntUnary(Op, DAG); } /// Lower a vector CTLZ using native supported vector CTLZ instruction. // // i8/i16 vector implemented using dword LZCNT vector instruction // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, // split the vector, perform operation on it's Lo a Hi part and // concatenate the results. static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(Op.getOpcode() == ISD::CTLZ); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"); // Split vector, it's Lo and Hi parts will be handled in next iteration. if (NumElems > 16 || (NumElems == 16 && !Subtarget.canExtendTo512DQ())) return LowerVectorIntUnary(Op, DAG); MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && "Unsupported value type for operation"); // Use native supported vector instruction vplzcntd. Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); } // Lower CTLZ using a PSHUFB lookup table implementation. static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); int NumElts = VT.getVectorNumElements(); int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8); MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes); // Per-nibble leading zero PSHUFB lookup table. const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2, /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1, /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0, /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0}; SmallVector LUTVec; for (int i = 0; i < NumBytes; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec); // Begin by bitcasting the input to byte vector, then split those bytes // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them. // If the hi input nibble is zero then we add both results together, otherwise // we just take the hi result (by masking the lo result to zero before the // add). SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, CurrVT); SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); SDValue Lo = Op0; SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); SDValue HiZ; if (CurrVT.is512BitVector()) { MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ); HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); } else { HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); } Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ); SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi); // Merge result back from vXi8 back to VT, working on the lo/hi halves // of the current vector width in the same way we did for the nibbles. // If the upper half of the input element is zero then add the halves' // leading zero counts together, otherwise just use the upper half's. // Double the width of the result until we are at target width. while (CurrVT != VT) { int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits(); int CurrNumElts = CurrVT.getVectorNumElements(); MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2); MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2); SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT); // Check if the upper half of the input element is zero. if (CurrVT.is512BitVector()) { MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0), DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); } else { HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); } HiZ = DAG.getBitcast(NextVT, HiZ); // Move the upper/lower halves to the lower bits as we'll be extending to // NextVT. Mask the lower result to zero if HiZ is true and add the results // together. SDValue ResNext = Res = DAG.getBitcast(NextVT, Res); SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift); SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift); R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1); Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1); CurrVT = NextVT; } return Res; } static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (Subtarget.hasCDI() && // vXi8 vectors need to be promoted to 512-bits for vXi32. (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8)) return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) return Lower512IntUnary(Op, DAG); assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); } static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); if (VT.isVector()) return LowerVectorCTLZ(Op, dl, Subtarget, DAG); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); if (Opc == ISD::CTLZ) { // If src is zero (i.e. bsr sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); } // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits - 1, dl, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getScalarSizeInBits(); SDValue N0 = Op.getOperand(0); SDLoc dl(Op); assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits, dl, VT), DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } /// Break a 256-bit integer operation into two new 128-bit ones and then /// concatenate the result back. static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } /// Break a 512-bit integer operation into two new 256-bit ones and then /// concatenate the result back. static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is512BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); if (VT == MVT::i16 || VT == MVT::i32) return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::XOR, SDLoc(Op), VT, Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return split256IntArith(Op, DAG); } static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDValue X = Op.getOperand(0), Y = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); if (VT.getScalarType() == MVT::i1) { SDLoc dl(Op); switch (Opcode) { default: llvm_unreachable("Expected saturated arithmetic opcode"); case ISD::UADDSAT: case ISD::SADDSAT: // *addsat i1 X, Y --> X | Y return DAG.getNode(ISD::OR, dl, VT, X, Y); case ISD::USUBSAT: case ISD::SSUBSAT: // *subsat i1 X, Y --> X & ~Y return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT)); } } if (VT.is128BitVector()) { // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDLoc DL(Op); if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) { // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y); SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT); return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add); } if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) { // usubsat X, Y --> (X >u Y) ? X - Y : 0 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); } // Use default expansion. return SDValue(); } assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return split256IntArith(Op, DAG); } static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) { // Since X86 does not have CMOV for 8-bit integer, we don't convert // 8-bit integer abs to NEG and CMOV. SDLoc DL(Op); SDValue N0 = Op.getOperand(0); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, DL, VT), N0); SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8), SDValue(Neg.getNode(), 1)}; return DAG.getNode(X86ISD::CMOV, DL, VT, Ops); } // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X). if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) { SDLoc DL(Op); SDValue Src = Op.getOperand(0); SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src); } if (VT.is256BitVector() && !Subtarget.hasInt256()) { assert(VT.isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntUnary(Op, DAG); } // Default to expand. return SDValue(); } static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // For AVX1 cases, split to use legal ops (everything but v4i64). if (VT.getScalarType() != MVT::i64 && VT.is256BitVector()) return split256IntArith(Op, DAG); SDLoc DL(Op); unsigned Opcode = Op.getOpcode(); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit, // using the SMIN/SMAX instructions and flipping the signbit back. if (VT == MVT::v8i16) { assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) && "Unexpected MIN/MAX opcode"); SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT); N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign); N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign); Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX); SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1); return DAG.getNode(ISD::XOR, DL, VT, Result, Sign); } // Else, expand to a compare/select. ISD::CondCode CC; switch (Opcode) { case ISD::SMIN: CC = ISD::CondCode::SETLT; break; case ISD::SMAX: CC = ISD::CondCode::SETGT; break; case ISD::UMIN: CC = ISD::CondCode::SETULT; break; case ISD::UMAX: CC = ISD::CondCode::SETUGT; break; default: llvm_unreachable("Unknown MINMAX opcode"); } SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC); return DAG.getSelect(DL, VT, Cond, N0, N1); } static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); // Decompose 256-bit ops into 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return split256IntArith(Op, DAG); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16 // vector pairs, multiply and truncate. if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); return DAG.getNode( ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::MUL, dl, ExVT, DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A), DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B))); } MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Extract the lo/hi parts to any extend to i16. // We're going to mask off the low byte of each result element of the // pmullw, so it doesn't matter what's in the high byte of each 16-bit // element. SDValue Undef = DAG.getUNDEF(VT); SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef)); SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef)); SDValue BLo, BHi; if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { // If the LHS is a constant, manually unpackl/unpackh. SmallVector LoOps, HiOps; for (unsigned i = 0; i != NumElts; i += 16) { for (unsigned j = 0; j != 8; ++j) { LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, MVT::i16)); HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, MVT::i16)); } } BLo = DAG.getBuildVector(ExVT, dl, LoOps); BHi = DAG.getBuildVector(ExVT, dl, HiOps); } else { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); } // Multiply, mask the lower 8bits of the lo/hi results and pack. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. if (VT == MVT::v4i32) { assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() && "Should not custom lower when pmulld is available!"); // Extract the odd parts. static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); // Multiply the even parts. SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, A), DAG.getBitcast(MVT::v2i64, B)); // Now multiply odd parts. SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, Aodds), DAG.getBitcast(MVT::v2i64, Bodds)); Evens = DAG.getBitcast(VT, Evens); Odds = DAG.getBitcast(VT, Odds); // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); assert(!Subtarget.hasDQI() && "DQI should use MULLQ"); // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // // AloBlo = pmuludq(a, b); // AloBhi = pmuludq(a, Bhi); // AhiBlo = pmuludq(Ahi, b); // // Hi = psllqi(AloBhi + AhiBlo, 32); // return AloBlo + Hi; KnownBits AKnown = DAG.computeKnownBits(A); KnownBits BKnown = DAG.computeKnownBits(B); APInt LowerBitsMask = APInt::getLowBitsSet(64, 32); bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero); bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero); APInt UpperBitsMask = APInt::getHighBitsSet(64, 32); bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero); bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero); SDValue Zero = DAG.getConstant(0, dl, VT); // Only multiply lo/hi halves that aren't known to be zero. SDValue AloBlo = Zero; if (!ALoIsZero && !BLoIsZero) AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); SDValue AloBhi = Zero; if (!ALoIsZero && !BHiIsZero) { SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); } SDValue AhiBlo = Zero; if (!AHiIsZero && !BLoIsZero) { SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); } SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi); } static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsSigned = Op->getOpcode() == ISD::MULHS; unsigned NumElts = VT.getVectorNumElements(); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Decompose 256-bit ops into 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return split256IntArith(Op, DAG); if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) { assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::v16i32 && Subtarget.hasAVX512())); // PMULxD operations multiply each even value (starting at 0) of LHS with // the related value of RHS and produce a widen result. // E.g., PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> // // In other word, to have all the results, we need to perform two PMULxD: // 1. one with the even values. // 2. one with the odd values. // To achieve #2, with need to place the odd values at an even position. // // Place the odd value at an even position (basically, shift all values 1 // step to the left): const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}; // => SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A, makeArrayRef(&Mask[0], NumElts)); // => SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B, makeArrayRef(&Mask[0], NumElts)); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2); unsigned Opcode = (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ; // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, DAG.getBitcast(MulVT, A), DAG.getBitcast(MulVT, B))); // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, DAG.getBitcast(MulVT, Odd0), DAG.getBitcast(MulVT, Odd1))); // Shuffle it back into the right order. SmallVector ShufMask(NumElts); for (int i = 0; i != (int)NumElts; ++i) ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1; SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask); // If we have a signed multiply but no PMULDQ fix up the result of an // unsigned multiply. if (IsSigned && !Subtarget.hasSSE41()) { SDValue Zero = DAG.getConstant(0, dl, VT); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A); SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup); } return Res; } // Only i8 vectors should need custom lowering after this. assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && "Unsupported vector type"); // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, // logical shift down the upper half and pack back to i8. // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack // and then ashr/lshr the upper bits down to the lower bits before multiply. unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A); SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB); Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); } // For signed 512-bit vectors, split into 256-bit vectors to allow the // sign-extension to occur. if (VT == MVT::v64i8 && IsSigned) return split512IntArith(Op, DAG); // Signed AVX2 implementation - extend xmm subvectors to ymm. if (VT == MVT::v32i8 && IsSigned) { MVT ExVT = MVT::v16i16; SDValue ALo = extract128BitVector(A, 0, DAG, dl); SDValue BLo = extract128BitVector(B, 0, DAG, dl); SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl); SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl); ALo = DAG.getNode(ExAVX, dl, ExVT, ALo); BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); // Bitcast back to VT and then pack all the even elements from Lo and Hi. // Shuffle lowering should turn this into PACKUS+PERMQ Lo = DAG.getBitcast(VT, Lo); Hi = DAG.getBitcast(VT, Hi); return DAG.getVectorShuffle(VT, dl, Lo, Hi, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}); } // For signed v16i8 and all unsigned vXi8 we will unpack the low and high // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies, // shift the results and pack the half lane results back together. MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}; // Extract the lo parts and zero/sign extend to i16. // Only use SSE4.1 instructions for signed v16i8 where using unpack requires // shifts to sign extend. Using unpack for unsigned only requires an xor to // create zeros and a copy due to tied registers contraints pre-avx. But using // zero_extend_vector_inreg would require an additional pshufd for the high // part. SDValue ALo, AHi; if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) { ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A); AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask); AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi); } else if (IsSigned) { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A)); AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A)); ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG); AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG); } else { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, DAG.getConstant(0, dl, VT))); AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, DAG.getConstant(0, dl, VT))); } SDValue BLo, BHi; if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { // If the LHS is a constant, manually unpackl/unpackh and extend. SmallVector LoOps, HiOps; for (unsigned i = 0; i != NumElts; i += 16) { for (unsigned j = 0; j != 8; ++j) { SDValue LoOp = B.getOperand(i + j); SDValue HiOp = B.getOperand(i + j + 8); if (IsSigned) { LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16); HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16); } else { LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16); HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16); } LoOps.push_back(LoOp); HiOps.push_back(HiOp); } } BLo = DAG.getBuildVector(ExVT, dl, LoOps); BHi = DAG.getBuildVector(ExVT, dl, HiOps); } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) { BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B); BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask); BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi); } else if (IsSigned) { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B)); BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG); BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG); } else { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, DAG.getConstant(0, dl, VT))); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, DAG.getConstant(0, dl, VT))); } // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and // pack back to vXi8. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG); RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG); // Bitcast back to VT and then pack all the even elements from Lo and Hi. return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); RTLIB::Libcall LC; bool isSigned; switch (Op->getOpcode()) { default: llvm_unreachable("Unexpected request for libcall!"); case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; } SDLoc dl(Op); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(InChain) .setLibCallee( getLibcallCallingConv(LC), static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, std::move(Args)) .setInRegister() .setSExtResult(isSigned) .setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); } // Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { if (VT.getScalarSizeInBits() < 16) return false; if (VT.is512BitVector() && Subtarget.hasAVX512() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || (VT.is256BitVector() && Subtarget.hasInt256()); bool AShift = LShift && (Subtarget.hasAVX512() || (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? AShift : LShift; } // The shift amount is a variable, but it is the same for all vector lanes. // These instructions are defined together with shift-immediate. static bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } // Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) return false; // vXi16 supported only on AVX-512, BWI if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; if (Subtarget.hasAVX512()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; return (Opcode == ISD::SRA) ? AShift : LShift; } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false); auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); SDValue Ex = DAG.getBitcast(ExVT, R); // ashr(R, 63) === cmp_slt(R, 0) if (ShiftAmt == 63 && Subtarget.hasSSE42()) { assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && "Unsupported PCMPGT op"); return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R); } if (ShiftAmt >= 32) { // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt - 32, DAG); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {9, 1, 11, 3, 13, 5, 15, 7}); } else { // SRA upper i32, SRL whole i64 and select lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); Lower = DAG.getBitcast(ExVT, Lower); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {8, 1, 10, 3, 12, 5, 14, 7}); } return DAG.getBitcast(VT, Ex); }; // Optimize shl/srl/sra with constant shift amount. APInt APIntShiftAmt; if (!isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); // If the shift amount is out of range, return undef. if (APIntShiftAmt.uge(VT.getScalarSizeInBits())) return DAG.getUNDEF(VT); uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || (Subtarget.hasInt256() && VT == MVT::v4i64)) && Op.getOpcode() == ISD::SRA) return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Simple i8 add case if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) return DAG.getNode(ISD::ADD, dl, VT, R, R); // ashr(R, 7) === cmp_slt(R, 0) if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { SDValue Zeros = DAG.getConstant(0, dl, VT); if (VT.is512BitVector()) { assert(VT == MVT::v64i8 && "Unexpected element type!"); SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT); return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); } return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // XOP can shift v16i8 directly instead of as shift v8i16 + mask. if (VT == MVT::v16i8 && Subtarget.hasXOP()) return SDValue(); if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt); return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } llvm_unreachable("Unknown shift opcode."); } return SDValue(); } static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true); if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) { if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) { MVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); } // vXi8 shifts - shift as v8i16 + mask result. if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) || (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) || VT == MVT::v64i8) && !Subtarget.hasXOP()) { unsigned NumElts = VT.getVectorNumElements(); MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) { unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL); unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false); BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); // Create the mask using vXi16 shifts. For shift-rights we need to move // the upper byte down before splatting the vXi8 mask. SDValue BitMask = DAG.getConstant(-1, dl, ExtVT); BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask, BaseShAmt, Subtarget, DAG); if (Opcode != ISD::SHL) BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask, 8, DAG); BitMask = DAG.getBitcast(VT, BitMask); BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask, SmallVector(NumElts, 0)); SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, DAG.getBitcast(ExtVT, R), BaseShAmt, Subtarget, DAG); Res = DAG.getBitcast(VT, Res); Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask); if (Opcode == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask) // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW. SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT); SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt, Subtarget, DAG); SignMask = DAG.getBitcast(VT, SignMask); Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask); } return Res; } } } // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); unsigned Ratio = 64 / Amt.getScalarValueSizeInBits(); std::vector Vals(Ratio); for (unsigned i = 0; i != Ratio; ++i) Vals[i] = Amt.getOperand(i); for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) { for (unsigned j = 0; j != Ratio; ++j) if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); } return SDValue(); } // Convert a shift/rotate left amount to a multiplication scale factor. static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Amt.getSimpleValueType(); if (!(VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget.hasInt256() && VT == MVT::v16i16) || (!Subtarget.hasAVX512() && VT == MVT::v16i8))) return SDValue(); if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { SmallVector Elts; MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); APInt One(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i = 0; i != NumElems; ++i) { SDValue Op = Amt->getOperand(i); if (Op->isUndef()) { Elts.push_back(Op); continue; } ConstantSDNode *ND = cast(Op); APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); continue; } Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); } return DAG.getBuildVector(VT, dl, Elts); } // If the target doesn't support variable shifts, use either FP conversion // or integer multiplication to avoid shifting each element individually. if (VT == MVT::v4i32) { Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, DAG.getConstant(0x3f800000U, dl, VT)); Amt = DAG.getBitcast(MVT::v4f32, Amt); return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt); } // AVX2 can more effectively perform this as a zext/trunc to/from v8i32. if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) { SDValue Z = DAG.getConstant(0, dl, VT); SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z)); SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z)); Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG); Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG); if (Subtarget.hasSSE41()) return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo), DAG.getBitcast(VT, Hi), {0, 2, 4, 6, 8, 10, 12, 14}); } return SDValue(); } static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned EltSizeInBits = VT.getScalarSizeInBits(); bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); unsigned Opc = Op.getOpcode(); unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true); unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false); assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"); if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) return V; if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) return V; if (SupportedVectorVarShift(VT, Subtarget, Opc)) return Op; // XOP has 128-bit variable logical/arithmetic shifts. // +ve/-ve Amt = shift left/right. if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) { if (Opc == ISD::SRL || Opc == ISD::SRA) { SDValue Zero = DAG.getConstant(0, dl, VT); Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); } if (Opc == ISD::SHL || Opc == ISD::SRL) return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); if (Opc == ISD::SRA) return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); } // 2i64 vector logical shifts can efficiently avoid scalarization - do the // shifts per-lane and then shuffle the partial results back together. if (VT == MVT::v2i64 && Opc != ISD::SRA) { // Splat the shift amounts so the scalar shifts above will catch it. SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } // i64 vector arithmetic shift can be emulated with the transform: // M = lshr(SIGN_MASK, Amt) // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) && Opc == ISD::SRA) { SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT); SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); R = DAG.getNode(ISD::XOR, dl, VT, R, M); R = DAG.getNode(ISD::SUB, dl, VT, R, M); return R; } // If possible, lower this shift as a sequence of two shifts by // constant plus a BLENDing shuffle instead of scalarizing it. // Example: // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) // // Could be rewritten as: // (v4i32 (MOVSS (srl A, ), (srl A, ))) // // The advantage is that the two shifts from the example would be // lowered as X86ISD::VSRLI nodes in parallel before blending. if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) { SDValue Amt1, Amt2; unsigned NumElts = VT.getVectorNumElements(); SmallVector ShuffleMask; for (unsigned i = 0; i != NumElts; ++i) { SDValue A = Amt->getOperand(i); if (A.isUndef()) { ShuffleMask.push_back(SM_SentinelUndef); continue; } if (!Amt1 || Amt1 == A) { ShuffleMask.push_back(i); Amt1 = A; continue; } if (!Amt2 || Amt2 == A) { ShuffleMask.push_back(i + NumElts); Amt2 = A; continue; } break; } // Only perform this blend if we can perform it without loading a mask. if (ShuffleMask.size() == NumElts && Amt1 && Amt2 && (VT != MVT::v16i16 || is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) && (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL || canWidenShuffleElements(ShuffleMask))) { auto *Cst1 = dyn_cast(Amt1); auto *Cst2 = dyn_cast(Amt2); if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) && Cst2->getAPIntValue().ult(EltSizeInBits)) { SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, Cst1->getZExtValue(), DAG); SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, Cst2->getZExtValue(), DAG); return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask); } } } // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. if (Opc == ISD::SHL) if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG)) return DAG.getNode(ISD::MUL, dl, VT, R, Scale); // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt). if (Opc == ISD::SRL && ConstantAmt && (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) { SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT); SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt); if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) { SDValue Zero = DAG.getConstant(0, dl, VT); SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ); SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale); return DAG.getSelect(dl, VT, ZAmt, R, Res); } } // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt). // TODO: Special case handling for shift by 0/1, really we can afford either // of these cases in pre-SSE41/XOP/AVX512 but not both. if (Opc == ISD::SRA && ConstantAmt && (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) && ((Subtarget.hasSSE41() && !Subtarget.hasXOP() && !Subtarget.hasAVX512()) || DAG.isKnownNeverZero(Amt))) { SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT); SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt); if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) { SDValue Amt0 = DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ); SDValue Amt1 = DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ); SDValue Sra1 = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG); SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale); Res = DAG.getSelect(dl, VT, Amt0, R, Res); return DAG.getSelect(dl, VT, Amt1, Sra1, Res); } } // v4i32 Non Uniform Shifts. // If the shift amount is constant we can shift each lane using the SSE2 // immediate shifts, else we need to zero-extend each lane to the lower i64 // and shift using the SSE2 variable shifts. // The separate results can then be blended together. if (VT == MVT::v4i32) { SDValue Amt0, Amt1, Amt2, Amt3; if (ConstantAmt) { Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); } else { // The SSE2 shifts use the lower i64 as the same shift amount for // all lanes and the upper i64 is ignored. On AVX we're better off // just zero-extending, but for SSE just duplicating the top 16-bits is // cheaper and has the same effect for out of range values. if (Subtarget.hasAVX()) { SDValue Z = DAG.getConstant(0, dl, VT); Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); } else { SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt); SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, {4, 5, 6, 7, -1, -1, -1, -1}); Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, {0, 1, 1, 1, -1, -1, -1, -1}); Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, {2, 3, 3, 3, -1, -1, -1, -1}); Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23, {0, 1, 1, 1, -1, -1, -1, -1}); Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23, {2, 3, 3, 3, -1, -1, -1, -1}); } } unsigned ShOpc = ConstantAmt ? Opc : X86OpcV; SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0)); SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1)); SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2)); SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3)); // Merge the shifted lane results optimally with/without PBLENDW. // TODO - ideally shuffle combining would handle this. if (Subtarget.hasSSE41()) { SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5}); SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7}); return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7}); } // It's worth extending once and using the vXi16/vXi32 shifts for smaller // types, but without AVX512 the extra overheads to get from vXi8 to vXi32 // make the existing SSE solution better. // NOTE: We honor prefered vector width before promoting to 512-bits. if ((Subtarget.hasInt256() && VT == MVT::v8i16) || (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) || (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) || (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) || (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) { assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && "Unexpected vector type"); MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32; MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; R = DAG.getNode(ExtOpc, dl, ExtVT, R); Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Opc, dl, ExtVT, R, Amt)); } // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI. if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) && (VT == MVT::v16i8 || VT == MVT::v64i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8); // Extend constant shift amount to vXi16 (it doesn't matter if the type // isn't legal). MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT); Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt); Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt); assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) && "Constant build vector expected"); if (VT == MVT::v16i8 && Subtarget.hasInt256()) { R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT) : DAG.getZExtOrTrunc(R, dl, ExVT); R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt); R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8); return DAG.getZExtOrTrunc(R, dl, VT); } SmallVector LoAmt, HiAmt; for (int i = 0; i != NumElts; i += 16) { for (int j = 0; j != 8; ++j) { LoAmt.push_back(Amt.getOperand(i + j)); HiAmt.push_back(Amt.getOperand(i + j + 8)); } } MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2); SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt); SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt); SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R)); SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R)); LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8); HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8); LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA); HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA); LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8); HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8); return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (VT.is512BitVector()) { // On AVX512BW targets we make use of the fact that VSELECT lowers // to a masked blend which selects bytes based just on the sign bit // extracted to a mask. MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel, ISD::SETGT); return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = DAG.getConstant(0, dl, SelVT); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); return DAG.getSelect(dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. Amt = DAG.getBitcast(ExtVT, Amt); Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG); Amt = DAG.getBitcast(VT, Amt); if (Opc == ISD::SHL || Opc == ISD::SRL) { // r = VSELECT(r, shift(r, 4), a); SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT)); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 2), a); M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT)); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // return VSELECT(r, shift(r, 1), a); M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT)); R = SignBitSelect(VT, Amt, M, R); return R; } if (Opc == ISD::SRA) { // For SRA we need to unpack each byte to the higher byte of a i16 vector // so we can correctly sign extend. We don't care what happens to the // lower byte. SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt); SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt); SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R); SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); RHi = DAG.getBitcast(ExtVT, RHi); // r = VSELECT(r, shift(r, 4), a); SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG); SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // a += a ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); // r = VSELECT(r, shift(r, 2), a); MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG); MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // a += a ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); // r = VSELECT(r, shift(r, 1), a); MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG); MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // Logical shift the result back to the lower byte, leaving a zero upper // byte meaning that we can safely pack with PACKUSWB. RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG); RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } } if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = DAG.getConstant(0, dl, VT); SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z); SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z); SDValue RLo = getUnpackl(DAG, dl, VT, Z, R); SDValue RHi = getUnpackh(DAG, dl, VT, Z, R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); RHi = DAG.getBitcast(ExtVT, RHi); SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo); SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi); Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG); Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG); return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); } if (VT == MVT::v8i16) { // If we have a constant shift amount, the non-SSE41 path is best as // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW. bool UseSSE41 = Subtarget.hasSSE41() && !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. if (UseSSE41) { MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue C = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG); return DAG.getSelect(dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; if (UseSSE41) { // On SSE41 targets we need to replicate the shift mask in both // bytes for PBLENDVB. Amt = DAG.getNode( ISD::OR, dl, VT, getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG), getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG)); } else { Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG); } // r = VSELECT(r, shift(r, 8), a); SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 4), a); M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 2), a); M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // return VSELECT(r, shift(r, 1), a); M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG); R = SignBitSelect(Amt, M, R); return R; } // Decompose 256-bit shifts into 128-bit shifts. if (VT.is256BitVector()) return split256IntArith(Op, DAG); return SDValue(); } static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.isVector() && "Custom lowering only for vector rotates!"); SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); int NumElts = VT.getVectorNumElements(); // Check for constant splat rotation amount. APInt UndefElts; SmallVector EltBits; int CstSplatIndex = -1; if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) for (int i = 0; i != NumElts; ++i) if (!UndefElts[i]) { if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) { CstSplatIndex = i; continue; } CstSplatIndex = -1; break; } // AVX512 implicitly uses modulo rotation amounts. if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) { // Attempt to rotate by immediate. if (0 <= CstSplatIndex) { unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(Op, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. return Op; } assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. // XOP implicitly uses modulo rotation amounts. if (Subtarget.hasXOP()) { if (VT.is256BitVector()) return split256IntArith(Op, DAG); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. if (0 <= CstSplatIndex) { uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). return Op; } // Split 256-bit integers on pre-AVX2 targets. if (VT.is256BitVector() && !Subtarget.hasAVX2()) return split256IntArith(Op, DAG); assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) && Subtarget.hasAVX2())) && "Only vXi32/vXi16/vXi8 vector rotates supported"); // Rotate by an uniform constant - expand back to shifts. if (0 <= CstSplatIndex) return SDValue(); bool IsSplatAmt = DAG.isSplatValue(Amt); // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by // the amount bit. if (EltSizeInBits == 8 && !IsSplatAmt) { if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) return SDValue(); // We don't need ModuloAmt here as we just peek at individual bits. MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = DAG.getConstant(0, DL, SelVT); SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel); return DAG.getSelect(DL, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. Amt = DAG.getBitcast(ExtVT, Amt); Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT)); Amt = DAG.getBitcast(VT, Amt); // r = VSELECT(r, rot(r, 4), a); SDValue M; M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)), DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT))); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt); // r = VSELECT(r, rot(r, 2), a); M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)), DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT))); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt); // return VSELECT(r, rot(r, 1), a); M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)), DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT))); return SignBitSelect(VT, Amt, M, R); } // ISD::ROT* uses modulo rotate amounts. Amt = DAG.getNode(ISD::AND, DL, VT, Amt, DAG.getConstant(EltSizeInBits - 1, DL, VT)); bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) && SupportedVectorVarShift(VT, Subtarget, ISD::SRL); // Fallback for splats + all supported variable shifts. // Fallback for non-constants AVX2 vXi16 as well. if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) { SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt); SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR); return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); } // As with shifts, convert the rotation amount to a multiplication factor. SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG); assert(Scale && "Failed to convert ROTL amount to scale"); // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results. if (EltSizeInBits == 16) { SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale); SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale); return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits // that can then be OR'd with the lower 32-bits. assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected"); static const int OddMask[] = {1, -1, 3, -1}; SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask); SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask); SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, R), DAG.getBitcast(MVT::v2i64, Scale)); SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, R13), DAG.getBitcast(MVT::v2i64, Scale13)); Res02 = DAG.getBitcast(VT, Res02); Res13 = DAG.getBitcast(VT, Res13); return DAG.getNode(ISD::OR, DL, VT, DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}), DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7})); } /// Returns true if the operand type is exactly twice the native width, and /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit(); if (OpWidth == 128) return Subtarget.hasCmpxchg16b(); return false; } // TODO: In 32-bit mode, use MOVLPS when SSE1 is available? // TODO: In 32-bit mode, use FISTP when X87 is available? bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { Type *MemType = SI->getValueOperand()->getType(); bool NoImplicitFloatOps = SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2()) return false; return needsCmpXchgNb(MemType); } // Note: this turns large loads into lock cmpxchg8b/16b. // TODO: In 32-bit mode, use MOVLPS when SSE1 is available? TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { Type *MemType = LI->getType(); // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we // can use movq to do the load. If we have X87 we can load into an 80-bit // X87 register and store it to a stack temporary. bool NoImplicitFloatOps = LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && !Subtarget.useSoftFloat() && !NoImplicitFloatOps && (Subtarget.hasSSE2() || Subtarget.hasX87())) return AtomicExpansionKind::None; return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { default: llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Xchg: case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. return !AI->use_empty() ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return AtomicExpansionKind::CmpXChg; } } LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. if (MemType->getPrimitiveSizeInBits() > NativeWidth) return nullptr; // If this is a canonical idempotent atomicrmw w/no uses, we have a better // lowering available in lowerAtomicArith. // TODO: push more cases through this path. if (auto *C = dyn_cast(AI->getValOperand())) if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() && AI->use_empty()) return nullptr; auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); // Before the load we need a fence. Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence // is required: // Thread 0: // x.store(1, relaxed); // r1 = y.fetch_add(0, release); // Thread 1: // y.fetch_add(42, acquire); // r2 = x.load(relaxed); // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SSID == SyncScope::SingleThread) // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; if (!Subtarget.hasMFence()) // FIXME: it might make sense to use a locked operation here but on a // different cache-line to prevent cache-line bouncing. In practice it // is probably a small win, and x86 processors without mfence are rare // enough that we do not bother. return nullptr; Function *MFence = llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(), AI->getType()->getPrimitiveSizeInBits()); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } /// Emit a locked operation on a stack location which does not change any /// memory location, but does involve a lock prefix. Location is chosen to be /// a) very likely accessed only by a single thread to minimize cache traffic, /// and b) definitely dereferenceable. Returns the new Chain result. static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, SDLoc DL) { // Implementation notes: // 1) LOCK prefix creates a full read/write reordering barrier for memory // operations issued by the current processor. As such, the location // referenced is not relevant for the ordering properties of the instruction. // See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual, // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions // 2) Using an immediate operand appears to be the best encoding choice // here since it doesn't require an extra register. // 3) OR appears to be very slightly faster than ADD. (Though, the difference // is small enough it might just be measurement noise.) // 4) When choosing offsets, there are several contributing factors: // a) If there's no redzone, we default to TOS. (We could allocate a cache // line aligned stack object to improve this case.) // b) To minimize our chances of introducing a false dependence, we prefer // to offset the stack usage from TOS slightly. // c) To minimize concerns about cross thread stack usage - in particular, // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which // captures state in the TOS frame and accesses it from many threads - // we want to use an offset such that the offset is in a distinct cache // line from the TOS frame. // // For a general discussion of the tradeoffs and benchmark results, see: // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ auto &MF = DAG.getMachineFunction(); auto &TFL = *Subtarget.getFrameLowering(); const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0; if (Subtarget.is64Bit()) { SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); SDValue Ops[] = { DAG.getRegister(X86::RSP, MVT::i64), // Base DAG.getTargetConstant(1, DL, MVT::i8), // Scale DAG.getRegister(0, MVT::i64), // Index DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp DAG.getRegister(0, MVT::i16), // Segment. Zero, Chain}; SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops); return SDValue(Res, 1); } SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); SDValue Ops[] = { DAG.getRegister(X86::ESP, MVT::i32), // Base DAG.getTargetConstant(1, DL, MVT::i8), // Scale DAG.getRegister(0, MVT::i32), // Index DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp DAG.getRegister(0, MVT::i16), // Segment. Zero, Chain }; SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops); return SDValue(Res, 1); } static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast( cast(Op.getOperand(1))->getZExtValue()); SyncScope::ID FenceSSID = static_cast( cast(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && FenceSSID == SyncScope::System) { if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); return emitLockedStackOp(DAG, Subtarget, Chain, dl); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); } static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT T = Op.getSimpleValueType(); SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; switch(T.SimpleTy) { default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; case MVT::i64: assert(Subtarget.is64Bit() && "Node not type legal!"); Reg = X86::RAX; size = 8; break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, MVT::i32, cpOut.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), cpOut, Success, EFLAGS.getValue(1)); } // Create MOVMSKB, taking into account whether we need to split for AVX1. static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT InVT = V.getSimpleValueType(); if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(V, DL); Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, DAG.getConstant(16, DL, MVT::i8)); return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi); } return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each // half to v32i1 and concatenating the result. if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) { assert(!Subtarget.is64Bit() && "Expected 32-bit mode"); assert(Subtarget.hasBWI() && "Expected BWI target"); SDLoc dl(Op); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src, DAG.getIntPtrConstant(0, dl)); Lo = DAG.getBitcast(MVT::v32i1, Lo); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src, DAG.getIntPtrConstant(1, dl)); Hi = DAG.getBitcast(MVT::v32i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } // Custom splitting for BWI types when AVX512F is available but BWI isn't. if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) { SDLoc dl(Op); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(), DstVT.getVectorNumElements() / 2); Lo = DAG.getBitcast(CastVT, Lo); Hi = DAG.getBitcast(CastVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); } // Use MOVMSK for vector to scalar conversion to prevent scalarization. if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) { assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512"); MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8; SDLoc DL(Op); SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT); V = getPMOVMSKB(DL, V, DAG, Subtarget); return DAG.getZExtOrTrunc(V, DL, DstVT); } if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || SrcVT == MVT::i64) { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64 && DstVT != MVT::i64 && !(DstVT == MVT::x86mmx && SrcVT.isVector())) // This conversion needs to be expanded. return SDValue(); SDLoc dl(Op); if (SrcVT.isVector()) { // Widen the vector in input in the case of MVT::v2i32. // Example: from MVT::v2i32 to MVT::v4i32. MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), SrcVT.getVectorNumElements() * 2); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, DAG.getUNDEF(SrcVT)); } else { assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && "Unexpected source type in LowerBITCAST"); Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); } MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64; Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); if (DstVT == MVT::x86mmx) return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, DAG.getIntPtrConstant(0, dl)); } assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() && Subtarget.hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && "Unexpected custom BITCAST"); // i64 <=> MMX conversions are Legal. if (SrcVT==MVT::i64 && DstVT.isVector()) return Op; if (DstVT==MVT::i64 && SrcVT.isVector()) return Op; // MMX <=> MMX conversions are Legal. if (SrcVT.isVector() && DstVT.isVector()) return Op; // All other conversions need to be expanded. return SDValue(); } /// Compute the horizontal sum of bytes in V for the elements of VT. /// /// Requires V to be a byte vector and VT to be an integer vector type with /// wider elements than V's type. The width of the elements of VT determines /// how many bytes of V are summed horizontally to produce each element of the /// result. static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(V); MVT ByteVecVT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); assert(ByteVecVT.getVectorElementType() == MVT::i8 && "Expected value to have byte element type."); assert(EltVT != MVT::i8 && "Horizontal byte sum only makes sense for wider elements!"); unsigned VecSize = VT.getSizeInBits(); assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); // PSADBW instruction horizontally add all bytes and leave the result in i64 // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } if (EltVT == MVT::i32) { // We unpack the low half and high half into i32s interleaved with zeros so // that we can use PSADBW to horizontally sum them. The most useful part of // this is that it lines up the results of two PSADBW instructions to be // two v2i64 vectors which concatenated are the 4 population counts. We can // then use PACKUSWB to shrink and concatenate them into a v4i32 again. SDValue Zeros = DAG.getConstant(0, DL, VT); SDValue V32 = DAG.getBitcast(VT, V); SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros); SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros); // Do the horizontal sums into two v2i64s. Zeros = DAG.getConstant(0, DL, ByteVecVT); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, DAG.getBitcast(ShortVecVT, Low), DAG.getBitcast(ShortVecVT, High)); return DAG.getBitcast(VT, V); } // The only element type left is i16. assert(EltVT == MVT::i16 && "Unknown how to handle type"); // To obtain pop count for each i16 element starting from the pop count for // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s // right by 8. It is important to shift as i16s as i8 vector shift isn't // directly supported. SDValue ShifterV = DAG.getConstant(8, DL, VT); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV); V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), DAG.getBitcast(ByteVecVT, V)); return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV); } static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); int NumElts = VT.getVectorNumElements(); (void)EltVT; assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported."); // Implement a lookup table in register by using an algorithm based on: // http://wm.ite.pl/articles/sse-popcount.html // // The general idea is that every lower byte nibble in the input vector is an // index into a in-register pre-computed pop count table. We then split up the // input vector in two new ones: (1) a vector with only the shifted-right // higher nibbles for each byte and (2) a vector with the lower nibbles (and // masked out higher ones) for each byte. PSHUFB is used separately with both // to index the in-register table. Next, both are added and the result is a // i8 vector where each element contains the pop count for input byte. const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; SmallVector LUTVec; for (int i = 0; i < NumElts; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec); SDValue M0F = DAG.getConstant(0x0F, DL, VT); // High nibbles SDValue FourV = DAG.getConstant(4, DL, VT); SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV); // Low nibbles SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F); // The input vector is used as the shuffle mask that index elements into the // LUT. After counting low and high nibbles, add the vector to obtain the // final pop count per i8 element. SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles); SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles); return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt); } // Please ensure that any codegen change from LowerVectorCTPOP is reflected in // updated cost models in X86TTIImpl::getIntrinsicInstrCost. static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && "Unknown CTPOP type to handle"); SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. if (Subtarget.hasVPOPCNTDQ()) { unsigned NumElems = VT.getVectorNumElements(); assert((VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16) && "Unexpected type"); if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) { MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0); Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op); return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } } // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) return Lower512IntUnary(Op, DAG); // For element types greater than i8, do vXi8 pop counts and a bytesum. if (VT.getScalarType() != MVT::i8) { MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); SDValue ByteOp = DAG.getBitcast(ByteVT, Op0); SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp); return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG); } // We can't use the fast LUT approach, so fall back on LegalizeDAG. if (!Subtarget.hasSSSE3()) return SDValue(); return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); SDLoc DL(Op); // For scalars, its still beneficial to transfer to/from the SIMD unit to // perform the BITREVERSE. if (!VT.isVector()) { MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits()); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } int NumElts = VT.getVectorNumElements(); int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector()) return Lower256IntUnary(Op, DAG); assert(VT.is128BitVector() && "Only 128-bit vector bitreverse lowering supported."); // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we // perform the BSWAP in the shuffle. // Its best to shuffle using the second operand as this will implicitly allow // memory folding for multiple vectors. SmallVector MaskElts; for (int i = 0; i != NumElts; ++i) { for (int j = ScalarSizeInBytes - 1; j >= 0; --j) { int SourceByte = 16 + (i * ScalarSizeInBytes) + j; int PermuteByte = SourceByte | (2 << 5); MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8)); } } SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts); SDValue Res = DAG.getBitcast(MVT::v16i8, In); Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8), Res, Mask); return DAG.getBitcast(VT, Res); } static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (Subtarget.hasXOP() && !VT.is512BitVector()) return LowerBITREVERSE_XOP(Op, DAG); assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); SDValue In = Op.getOperand(0); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && "Only byte vector BITREVERSE supported"); // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Perform BITREVERSE using PSHUFB lookups. Each byte is split into // two nibbles and a PSHUFB lookup to find the bitreverse of each // 0-15 value (moved to the other nibble). SDValue NibbleMask = DAG.getConstant(0xF, DL, VT); SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask); SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT)); const int LoLUT[16] = { /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0, /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0, /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0, /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0}; const int HiLUT[16] = { /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C, /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E, /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D, /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F}; SmallVector LoMaskElts, HiMaskElts; for (unsigned i = 0; i < NumElts; ++i) { LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8)); HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8)); } SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts); SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts); Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo); Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi); return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NewOpc = 0; switch (N->getOpcode()) { case ISD::ATOMIC_LOAD_ADD: NewOpc = X86ISD::LADD; break; case ISD::ATOMIC_LOAD_SUB: NewOpc = X86ISD::LSUB; break; case ISD::ATOMIC_LOAD_OR: NewOpc = X86ISD::LOR; break; case ISD::ATOMIC_LOAD_XOR: NewOpc = X86ISD::LXOR; break; case ISD::ATOMIC_LOAD_AND: NewOpc = X86ISD::LAND; break; default: llvm_unreachable("Unknown ATOMIC_LOAD_ opcode"); } MachineMemOperand *MMO = cast(N)->getMemOperand(); return DAG.getMemIntrinsicNode( NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), {N->getOperand(0), N->getOperand(1), N->getOperand(2)}, /*MemVT=*/N->getSimpleValueType(0), MMO); } /// Lower atomic_load_ops into LOCK-prefixed operations. static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { AtomicSDNode *AN = cast(N.getNode()); SDValue Chain = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); unsigned Opc = N->getOpcode(); MVT VT = N->getSimpleValueType(0); SDLoc DL(N); // We can lower atomic_load_add into LXADD. However, any other atomicrmw op // can only be lowered when the result is unused. They should have already // been transformed into a cmpxchg loop in AtomicExpand. if (N->hasAnyUseOfValue(0)) { // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to // select LXADD if LOCK_SUB can't be selected. if (Opc == ISD::ATOMIC_LOAD_SUB) { RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS, AN->getMemOperand()); } assert(Opc == ISD::ATOMIC_LOAD_ADD && "Used AtomicRMW ops other than Add should have been expanded!"); return N; } // Specialized lowering for the canonical form of an idemptotent atomicrmw. // The core idea here is that since the memory location isn't actually // changing, all we need is a lowering for the *ordering* impacts of the // atomicrmw. As such, we can chose a different operation and memory // location to minimize impact on other code. if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) { // On X86, the only ordering which actually requires an instruction is // seq_cst which isn't SingleThread, everything just needs to be preserved // during codegen and then dropped. Note that we expect (but don't assume), // that orderings other than seq_cst and acq_rel have been canonicalized to // a store or load. if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent && AN->getSyncScopeID() == SyncScope::System) { // Prefer a locked operation against a stack location to minimize cache // traffic. This assumes that stack locations are very likely to be // accessed only by the owning thread. SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL); assert(!N->hasAnyUseOfValue(0)); // NOTE: The getUNDEF is needed to give something for the unused result 0. return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT), NewChain); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain); assert(!N->hasAnyUseOfValue(0)); // NOTE: The getUNDEF is needed to give something for the unused result 0. return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT), NewChain); } SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); // RAUW the chain, but don't worry about the result, as it's unused. assert(!N->hasAnyUseOfValue(0)); // NOTE: The getUNDEF is needed to give something for the unused result 0. return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT), LockOp.getValue(1)); } static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { auto *Node = cast(Op.getNode()); SDLoc dl(Node); EVT VT = Node->getMemoryVT(); bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent; bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT); // If this store is not sequentially consistent and the type is legal // we can just keep it. if (!IsSeqCst && IsTypeLegal) return Op; if (VT == MVT::i64 && !IsTypeLegal) { // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled. // FIXME: Use movlps with SSE1. // FIXME: Use fist with X87. bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2()) { SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getOperand(2)); SDVTList Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); // If this is a sequentially consistent store, also emit an appropriate // barrier. if (IsSeqCst) Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); return Chain; } } // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), Node->getMemOperand()); return Swap.getValue(1); } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); MVT VT = N->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); SDLoc DL(N); // Set the carry flag. SDValue Carry = Op.getOperand(2); EVT CarryVT = Carry.getValueType(); APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getConstant(NegOne, DL, CarryVT)); unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB; SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0), Op.getOperand(1), Carry.getValue(1)); SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG); if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) : (Type *)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); std::pair CallResult = TLI.LowerCallTo(CLI); if (isF64) // Returned in xmm0 and xmm1. return CallResult.first; // Returned in bits 0:31 and 32:64 xmm0. SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(0, dl)); SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(1, dl)); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes = false) { // Check if InOp already has the right width. MVT InVT = InOp.getSimpleValueType(); if (InVT == NVT) return InOp; if (InOp.isUndef()) return DAG.getUNDEF(NVT); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); unsigned InNumElts = InVT.getVectorNumElements(); unsigned WidenNumElts = NVT.getVectorNumElements(); assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { SDValue N1 = InOp.getOperand(1); if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || N1.isUndef()) { InOp = InOp.getOperand(0); InVT = InOp.getSimpleValueType(); InNumElts = InVT.getVectorNumElements(); } } if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { SmallVector Ops; for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); EVT EltVT = InOp.getOperand(0).getValueType(); SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) Ops.push_back(FillVal); return DAG.getBuildVector(NVT, dl, Ops); } SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedScatterSDNode *N = cast(Op.getNode()); SDValue Src = N->getValue(); MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); SDValue Scale = N->getScale(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); if (VT == MVT::v2f32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); // If the index is v2i64 and we have VLX we can use xmm for data and index. if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); return SDValue(NewScatter.getNode(), 1); } return SDValue(); } if (VT == MVT::v2i32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(MVT::v2i32)); // If the index is v2i64 and we have VLX we can use xmm for data and index. if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); return SDValue(NewScatter.getNode(), 1); } // Custom widen all the operands to avoid promotion. EVT NewIndexVT = EVT::getVectorVT( *DAG.getContext(), Index.getValueType().getVectorElementType(), 4); Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, DAG.getUNDEF(Index.getValueType())); Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getConstant(0, dl, MVT::v2i1)); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl, Ops, N->getMemOperand()); } MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); // If the index is v2i32, we're being called by type legalization and we // should just let the default handling take care of it. if (IndexVT == MVT::v2i32) return SDValue(); // If we don't have VLX and neither the passthru or index is 512-bits, we // need to widen until one is. if (!Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // Determine how much we need to widen by to get a 512-bit type. unsigned Factor = std::min(512/VT.getSizeInBits(), 512/IndexVT.getSizeInBits()); unsigned NumElts = VT.getVectorNumElements() * Factor; VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); MaskVT = MVT::getVectorVT(MVT::i1, NumElts); Src = ExtendToType(Src, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); Mask = ExtendToType(Mask, MaskVT, DAG, true); } SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); return SDValue(NewScatter.getNode(), 1); } static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedLoadSDNode *N = cast(Op.getNode()); MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); MVT MaskVT = Mask.getSimpleValueType(); SDValue PassThru = N->getPassThru(); SDLoc dl(Op); // Handle AVX masked loads which don't support passthru other than 0. if (MaskVT.getVectorElementType() != MVT::i1) { // We also allow undef in the isel pattern. if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) return Op; SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(), N->getBasePtr(), Mask, getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(), N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); // Emit a blend. SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, PassThru); return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); } assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) && "Expanding masked load is supported for 32 and 64-bit types only!"); assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked load op."); assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked load op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); PassThru = ExtendToType(PassThru, WideDataVT, DAG); // Mask element has to be i1. assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && "Unexpected mask type"); MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), N->getBasePtr(), Mask, PassThru, N->getMemoryVT(), N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), DAG.getIntPtrConstant(0, dl)); SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedStoreSDNode *N = cast(Op.getNode()); SDValue DataToStore = N->getValue(); MVT VT = DataToStore.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); SDLoc dl(Op); assert((!N->isCompressingStore() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) && "Expanding masked load is supported for 32 and 64-bit types only!"); assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked store op."); assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked store op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); // Mask element has to be i1. assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && "Unexpected mask type"); MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), N->isTruncatingStore(), N->isCompressingStore()); } static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only"); MaskedGatherSDNode *N = cast(Op.getNode()); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue PassThru = N->getPassThru(); MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); // If the index is v2i32, we're being called by type legalization. if (IndexVT == MVT::v2i32) return SDValue(); // If we don't have VLX and neither the passthru or index is 512-bits, we // need to widen until one is. MVT OrigVT = VT; if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && !IndexVT.is512BitVector()) { // Determine how much we need to widen by to get a 512-bit type. unsigned Factor = std::min(512/VT.getSizeInBits(), 512/IndexVT.getSizeInBits()); unsigned NumElts = VT.getVectorNumElements() * Factor; VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); MaskVT = MVT::getVectorVT(MVT::i1, NumElts); PassThru = ExtendToType(PassThru, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); Mask = ExtendToType(Mask, MaskVT, DAG, true); } SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index, N->getScale() }; SDValue NewGather = DAG.getTargetMemSDNode( DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(), N->getMemOperand()); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather, DAG.getIntPtrConstant(0, dl)); return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl); } SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); return NOOP; } SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); return NOOP; } /// Provide custom lowering hooks for some operations. SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::SHL_PARTS: case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::MULHS: case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::ROTL: case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::ADD: case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget); case ISD::UADDSAT: case ISD::SADDSAT: case ISD::USUBSAT: case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::ABS: return LowerABS(Op, Subtarget, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: return LowerGC_TRANSITION_START(Op, DAG); case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); } } /// Places new result values for the node in Results (their number /// and types must exactly match those of the original return values of /// the node), or leaves Results empty, which indicates that the node is not /// to be custom lowered after all. void X86TargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res = LowerOperation(SDValue(N, 0), DAG); if (!Res.getNode()) return; // If the original node has one result, take the return value from // LowerOperation as is. It might not be result number 0. if (N->getNumValues() == 1) { Results.push_back(Res); return; } // If the original node has multiple results, then the return node should // have the same number of results. assert((N->getNumValues() == Res->getNumValues()) && "Lowering returned the wrong number of results!"); // Places new result values base on N result number. for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) Results.push_back(Res.getValue(I)); } /// Replace a node with an illegal result type with a new node built out of /// custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { SDLoc dl(N); switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ReplaceNodeResults: "; N->dump(&DAG); #endif llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::CTPOP: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); // Use a v2i64 if possible. bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) { SDValue Wide = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0)); Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide); // Bit count should fit in 32-bits, extract it as that and then zero // extend to i64. Otherwise we end up extracting bits 63:32 separately. Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide); Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide, DAG.getIntPtrConstant(0, dl)); Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide); Results.push_back(Wide); } return; } case ISD::MUL: { EVT VT = N->getValueType(0); assert(VT.isVector() && "Unexpected VT"); if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger && VT.getVectorNumElements() == 2) { // Promote to a pattern that will be turned into PMULUDQ. SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, N->getOperand(0)); SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, N->getOperand(1)); SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul)); } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && VT.getVectorElementType() == MVT::i8) { // Pre-promote these to vXi16 to avoid op legalization thinking all 16 // elements are needed. MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); unsigned NumConcats = 16 / VT.getVectorNumElements(); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); Results.push_back(Res); } return; } case ISD::UADDSAT: case ISD::SADDSAT: case ISD::USUBSAT: case ISD::SSUBSAT: case X86ISD::VPMADDWD: case X86ISD::AVG: { // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and // X86ISD::AVG/VPMADDWD by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); EVT InVT = N->getOperand(0).getValueType(); assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && "Expected a VT that divides into 128 bits."); unsigned NumConcat = 128 / InVT.getSizeInBits(); EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), NumConcat * InVT.getVectorNumElements()); EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumConcat * VT.getVectorNumElements()); SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); Ops[0] = N->getOperand(0); SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); Ops[0] = N->getOperand(1); SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } case ISD::ABS: { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); MVT HalfT = MVT::i32; SDValue Lo, Hi, Tmp; SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), DAG.getConstant(0, dl, HalfT)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), DAG.getConstant(1, dl, HalfT)); Tmp = DAG.getNode( ISD::SRA, dl, HalfT, Hi, DAG.getConstant(HalfT.getSizeInBits() - 1, dl, TLI.getShiftAmountTy(HalfT, DAG.getDataLayout()))); Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, SDValue(Lo.getNode(), 1)); Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); Results.push_back(Lo); Results.push_back(Hi); return; } case ISD::SETCC: { // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when // setCC result type is v2i1 because type legalzation will end up with // a v4i1 setcc plus an extend. assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type"); if (N->getOperand(0).getValueType() != MVT::v2f32 || getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) return; SDValue UNDEF = DAG.getUNDEF(MVT::v2f32); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS, N->getOperand(2)); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); return; } case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: { EVT VT = N->getValueType(0); if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) { // If this RHS is a constant splat vector we can widen this and let // division/remainder by constant optimize it. // TODO: Can we do something for non-splat? APInt SplatVal; if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) { unsigned NumConcats = 128 / VT.getSizeInBits(); SmallVector Ops0(NumConcats, DAG.getUNDEF(VT)); Ops0[0] = N->getOperand(0); EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0); SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT); SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1); Results.push_back(Res); } return; } if (VT == MVT::v2i32) { // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the // v2i64 and unroll later. But then we create i64 scalar ops which // might be slow in 64-bit mode or require a libcall in 32-bit mode. Results.push_back(DAG.UnrollVectorOp(N)); return; } if (VT.isVector()) return; LLVM_FALLTHROUGH; } case ISD::SDIVREM: case ISD::UDIVREM: { SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); Results.push_back(V); return; } case ISD::TRUNCATE: { MVT VT = N->getSimpleValueType(0); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) return; // The generic legalizer will try to widen the input type to the same // number of elements as the widened result type. But this isn't always // the best thing so do some custom legalization to avoid some cases. MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT(); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); unsigned InBits = InVT.getSizeInBits(); if (128 % InBits == 0) { // 128 bit and smaller inputs should avoid truncate all together and // just use a build_vector that will become a shuffle. // TODO: Widen and use a shuffle directly? MVT InEltVT = InVT.getSimpleVT().getVectorElementType(); EVT EltVT = VT.getVectorElementType(); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SmallVector Ops(WidenNumElts, DAG.getUNDEF(EltVT)); // Use the original element count so we don't do more scalar opts than // necessary. unsigned MinElts = VT.getVectorNumElements(); for (unsigned i=0; i < MinElts; ++i) { SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In, DAG.getIntPtrConstant(i, dl)); Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val); } Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops)); return; } // With AVX512 there are some cases that can use a target specific // truncate node to go from 256/512 to less than 128 with zeros in the // upper elements of the 128 bit result. if (Subtarget.hasAVX512() && isTypeLegal(InVT)) { // We can use VTRUNC directly if for 256 bits with VLX or for any 512. if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) { Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); return; } // There's one case we can widen to 512 bits and use VTRUNC. if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) { In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In, DAG.getUNDEF(MVT::v4i64)); Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); return; } } return; } case ISD::SIGN_EXTEND_VECTOR_INREG: { if (ExperimentalVectorWideningLegalization) return; EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && (InVT == MVT::v16i16 || InVT == MVT::v32i8)) { // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting // we allow the sra from the extend to i32 to be shared by the split. EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), InVT.getVectorNumElements() / 2); MVT ExtendVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements()); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT, In, DAG.getIntPtrConstant(0, dl)); In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In); // Fill a vector with sign bits for each element. SDValue Zero = DAG.getConstant(0, dl, ExtendVT); SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // Create an unpackl and unpackh to interleave the sign bits then bitcast // to vXi64. SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits); Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits); Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); return; } return; } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && (InVT == MVT::v4i16 || InVT == MVT::v4i8) && getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) { assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting // we allow the sra from the extend to i32 to be shared by the split. In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In); // Fill a vector with sign bits for each element. SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT); // Create an unpackl and unpackh to interleave the sign bits then bitcast // to v2i64. SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {0, 4, 1, 5}); Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo); SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {2, 6, 3, 7}); Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); return; } if (VT == MVT::v16i32 || VT == MVT::v8i64) { if (!InVT.is128BitVector()) { // Not a 128 bit vector, but maybe type legalization will promote // it to 128 bits. if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger) return; InVT = getTypeToTransformTo(*DAG.getContext(), InVT); if (!InVT.is128BitVector()) return; // Promote the input to 128 bits. Type legalization will turn this into // zext_inreg/sext_inreg. In = DAG.getNode(N->getOpcode(), dl, InVT, In); } // Perform custom splitting instead of the two stage extend we would get // by default. EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); assert(isTypeLegal(LoVT) && "Split VT not legal?"); SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG); // We need to shift the input over by half the number of elements. unsigned NumElts = InVT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; SmallVector ShufMask(NumElts, SM_SentinelUndef); for (unsigned i = 0; i != HalfNumElts; ++i) ShufMask[i] = i + HalfNumElts; SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); } return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); // Promote these manually to avoid over promotion to v2i64. Type // legalization will revisit the v2i32 operation for more cleanup. if ((VT == MVT::v2i8 || VT == MVT::v2i16) && getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { // AVX512DQ provides instructions that produce a v2i64 result. if (Subtarget.hasDQI()) return; SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src); Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext : ISD::AssertSext, dl, MVT::v2i32, Res, DAG.getValueType(VT.getVectorElementType())); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); Results.push_back(Res); return; } if (VT.isVector() && VT.getScalarSizeInBits() < 32) { if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) return; // Try to create a 128 bit vector, but don't exceed a 32 bit element. unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth), VT.getVectorNumElements()); SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); // Preserve what we know about the size of the original result. Except // when the result is v2i32 since we can't widen the assert. if (PromoteVT != MVT::v2i32) Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext : ISD::AssertSext, dl, PromoteVT, Res, DAG.getValueType(VT.getVectorElementType())); // Truncate back to the original width. Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); // Now widen to 128 bits. unsigned NumConcats = 128 / VT.getSizeInBits(); MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), VT.getVectorNumElements() * NumConcats); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); Results.push_back(Res); return; } if (VT == MVT::v2i32) { assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); bool Widenv2i32 = getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector; if (Src.getValueType() == MVT::v2f64) { unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { // If v2i32 is widened, we can defer to the generic legalizer. if (Widenv2i32) return; // Custom widen by doubling to a legal vector with. Isel will // further widen to v8f64. Opc = ISD::FP_TO_UINT; Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src, DAG.getUNDEF(MVT::v2f64)); } SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); if (!Widenv2i32) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } if (SrcVT == MVT::v2f32 && getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { SDValue Idx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs, // so early out here. return; } if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); unsigned NumElts = Subtarget.hasVLX() ? 4 : 8; // Using a 256-bit input here to guarantee 128-bit input for f32 case. // TODO: Use 128-bit vectors for f64 case? // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI. MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts); MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts); SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx); Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx); Results.push_back(Res); return; } if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned)) Results.push_back(V); return; } case ISD::SINT_TO_FP: { assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); SDValue Src = N->getOperand(0); if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64) return; Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src)); return; } case ISD::UINT_TO_FP: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); if (VT != MVT::v2f32) return; SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src)); return; } if (SrcVT != MVT::v2i32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; } case ISD::FP_ROUND: { if (!isTypeLegal(N->getOperand(0).getValueType())) return; SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); return; } case ISD::FP_EXTEND: { // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. // No other ValueType for FP_EXTEND should reach this point. assert(N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"); return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget, Results); return; case Intrinsic::x86_xgetbv: expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget, Results); return; } } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; assert((!Regs64bit || Subtarget.hasCmpxchg16b()) && "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"); MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(0, dl, HalfT)); cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(1, dl, HalfT)); cpInL = DAG.getCopyToReg(N->getOperand(0), dl, Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX, cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(0, dl, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(1, dl, HalfT)); swapInH = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, cpInH.getValue(1)); // If the current function needs the base pointer, RBX, // we shouldn't use cmpxchg directly. // Indeed the lowering of that instruction will clobber // that register and since RBX will be a reserved register // the register allocator will not make sure its value will // be properly saved and restored around this live-range. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); unsigned BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast(N)->getMemOperand(); if (TRI->hasBasePointer(DAG.getMachineFunction()) && (BasePtr == X86::RBX || BasePtr == X86::EBX)) { // ISel prefers the LCMPXCHG64 variant. // If that assert breaks, that means it is not the case anymore, // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, // not just EBX. This is a matter of accepting i64 input for that // pseudo, and restoring into the register of the right wide // in expand pseudo. Everything else should just work. assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && "Saving only half of the RBX"); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG : X86ISD::LCMPXCHG8_SAVE_EBX_DAG; SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, HalfT, swapInH.getValue(1)); SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, RBXSave, /*Glue*/ RBXSave.getValue(2)}; Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); } else { unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, swapInL, swapInH.getValue(1)); SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), swapInL.getValue(1)}; Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); } SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(Success); Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_LOAD: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { auto *Node = cast(N); if (Subtarget.hasSSE2()) { // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the // lower 64-bits. SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Ld.getValue(1)); return; } if (Subtarget.hasX87()) { // First load this into an 80-bit X87 register. This will put the whole // integer into the significand. // FIXME: Do we need to glue? See FIXME comment in BuildFILD. SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue); SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); SDValue Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // Now store the X87 register to a stack temporary and convert to i64. // This store is not atomic and doesn't need to be. // FIXME: We don't need a stack temporary if the result of the load // is already being stored. We could just directly store there. SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); int SPFI = cast(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag }; Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64, MPI, 0 /*Align*/, MachineMemOperand::MOStore); // Finally load the value back from the stack temporary and return it. // This load is not atomic and doesn't need to be. // This load will be further type legalized. Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI); Results.push_back(Result); Results.push_back(Result.getValue(1)); return; } } // TODO: Use MOVLPS when SSE1 is available? // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target // we can split using the k-register rather than memory. if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) { assert(!Subtarget.is64Bit() && "Expected 32-bit mode"); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); Lo = DAG.getBitcast(MVT::i32, Lo); Hi = DAG.getBitcast(MVT::i32, Hi); SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); Results.push_back(Res); return; } // Custom splitting for BWI types when AVX512F is available but BWI isn't. if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) && SrcVT.isVector() && isTypeLegal(SrcVT)) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8; Lo = DAG.getBitcast(CastVT, Lo); Hi = DAG.getBitcast(CastVT, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); Results.push_back(Res); return; } if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) return; unsigned NumElts = DstVT.getVectorNumElements(); EVT SVT = DstVT.getVectorElementType(); EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue Res; Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); Res = DAG.getBitcast(WiderVT, Res); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Gather->getPassThru(), DAG.getUNDEF(MVT::v2f32)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } if (VT == MVT::v2i32) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Gather->getPassThru(), DAG.getUNDEF(MVT::v2i32)); // If the index is v2i64 we can use it directly. if (Index.getValueType() == MVT::v2i64 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); SDValue Chain = Res.getValue(2); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Chain); return; } if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { EVT IndexVT = Index.getValueType(); EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), IndexVT.getScalarType(), 4); // Otherwise we need to custom widen everything to avoid promotion. Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, DAG.getUNDEF(IndexVT)); Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getConstant(0, dl, MVT::v2i1)); SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), Gather->getMemoryVT(), dl, Ops, Gather->getMemOperand()); SDValue Chain = Res.getValue(1); if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Chain); return; } } return; } case ISD::LOAD: { // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp // cast since type legalization will try to use an i64 load. MVT VT = N->getSimpleValueType(0); assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) return; if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); if (Subtarget.hasSSE2()) { MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); MVT WideVT = MVT::getVectorVT(LdVT, 2); Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() * 2); Res = DAG.getBitcast(CastVT, Res); Results.push_back(Res); Results.push_back(Chain); return; } assert(Subtarget.hasSSE1() && "Expected SSE"); SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other); SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()}; SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, Ld->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(1)); return; } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((X86ISD::NodeType)Opcode) { case X86ISD::FIRST_NUMBER: break; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; case X86ISD::FIST: return "X86ISD::FIST"; case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAXS: return "X86ISD::FMAXS"; case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE"; case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMINS: return "X86ISD::FMINS"; case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE"; case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; case X86ISD::EH_SJLJ_SETUP_DISPATCH: return "X86ISD::EH_SJLJ_SETUP_DISPATCH"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; case X86ISD::LADD: return "X86ISD::LADD"; case X86ISD::LSUB: return "X86ISD::LSUB"; case X86ISD::LOR: return "X86ISD::LOR"; case X86ISD::LXOR: return "X86ISD::LXOR"; case X86ISD::LAND: return "X86ISD::LAND"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC"; case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS"; case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS"; case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES"; case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE"; case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS"; case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS"; case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; case X86ISD::VSRA: return "X86ISD::VSRA"; case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; case X86ISD::VSHLV: return "X86ISD::VSHLV"; case X86ISD::VSRLV: return "X86ISD::VSRLV"; case X86ISD::VSRAV: return "X86ISD::VSRAV"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::VPPERM: return "X86ISD::VPPERM"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::BZHI: return "X86ISD::BZHI"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::KADD: return "X86ISD::KADD"; case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::VSHLD: return "X86ISD::VSHLD"; case X86ISD::VSHRD: return "X86ISD::VSHRD"; case X86ISD::VSHLDV: return "X86ISD::VSHLDV"; case X86ISD::VSHRDV: return "X86ISD::VSHRDV"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::SHUF128: return "X86ISD::SHUF128"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE"; case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE"; case X86ISD::VRANGES: return "X86ISD::VRANGES"; case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::RDPKRU: return "X86ISD::RDPKRU"; case X86ISD::WRPKRU: return "X86ISD::WRPKRU"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE"; case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE"; case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; case X86ISD::SELECTS: return "X86ISD::SELECTS"; case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; case X86ISD::RCP14: return "X86ISD::RCP14"; case X86ISD::RCP14S: return "X86ISD::RCP14S"; case X86ISD::RCP28: return "X86ISD::RCP28"; case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE"; case X86ISD::RCP28S: return "X86ISD::RCP28S"; case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE"; case X86ISD::EXP2: return "X86ISD::EXP2"; case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE"; case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; case X86ISD::FADDS: return "X86ISD::FADDS"; case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; case X86ISD::FSUBS: return "X86ISD::FSUBS"; case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; case X86ISD::FMULS: return "X86ISD::FMULS"; case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; case X86ISD::FDIVS: return "X86ISD::FDIVS"; case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FSQRTS: return "X86ISD::FSQRTS"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; case X86ISD::FGETEXP: return "X86ISD::FGETEXP"; case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE"; case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS"; case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE"; case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE"; case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI"; case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI"; case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE"; case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P"; case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP"; case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP"; case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH"; case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE"; case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI"; case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI"; case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI"; case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16"; case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16"; case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16"; case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD"; case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS"; case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD"; case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS"; case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB"; case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB"; case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB"; case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB"; case X86ISD::NT_CALL: return "X86ISD::NT_CALL"; case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND"; case X86ISD::UMWAIT: return "X86ISD::UMWAIT"; case X86ISD::TPAUSE: return "X86ISD::TPAUSE"; case X86ISD::ENQCMD: return "X86ISD:ENQCMD"; case X86ISD::ENQCMDS: return "X86ISD:ENQCMDS"; case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT"; } return nullptr; } /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV); // If a reference to this global requires an extra load, we can't fold it. if (isGlobalStubReference(GVFlags)) return false; // If BaseGV requires a register for the PIC base, we cannot also have a // BaseReg specified. if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) return false; // If lower 4G is not available, then we must use rip-relative addressing. if ((M != CodeModel::Small || isPositionIndependent()) && Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } switch (AM.Scale) { case 0: case 1: case 2: case 4: case 8: // These scales always work. break; case 3: case 5: case 9: // These scales are formed with basereg+scalereg. Only accept if there is // no basereg yet. if (AM.HasBaseReg) return false; break; default: // Other stuff never works. return false; } return true; } bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); // 8-bit shifts are always expensive, but versions with a scalar amount aren't // particularly cheaper than those without. if (Bits == 8) return false; // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) return false; // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable // shifts just as cheap as scalar ones. if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64)) return false; // AVX512BW has shifts such as vpsllvw. if (Subtarget.hasBWI() && Bits == 16) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a // fully general vector. return true; } bool X86TargetLowering::isBinOp(unsigned Opcode) const { switch (Opcode) { // These are non-commutative binops. // TODO: Add more X86ISD opcodes once we have test coverage. case X86ISD::ANDNP: case X86ISD::PCMPGT: case X86ISD::FMAX: case X86ISD::FMIN: case X86ISD::FANDN: return true; } return TargetLoweringBase::isBinOp(Opcode); } bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { switch (Opcode) { // TODO: Add more X86ISD opcodes once we have test coverage. case X86ISD::PCMPEQ: case X86ISD::PMULDQ: case X86ISD::PMULUDQ: case X86ISD::FMAXC: case X86ISD::FMINC: case X86ISD::FAND: case X86ISD::FOR: case X86ISD::FXOR: return true; } return TargetLoweringBase::isCommutativeBinOp(Opcode); } bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { // Can also use sub to handle negated immediates. return isInt<32>(Imm); } bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) return true; if (Val.getOpcode() != ISD::LOAD) return false; if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i8: case MVT::i16: case MVT::i32: // X86 has 8, 16, and 32-bit zero-extending loads. return true; } return false; } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT SrcVT = ExtVal.getOperand(0).getValueType(); // There is no extending load for vXi1. if (SrcVT.getScalarType() == MVT::i1) return false; return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { if (!Subtarget.hasAnyFMA()) return false; VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); } /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool X86TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (!VT.isSimple()) return false; // Not for i1 vectors if (VT.getSimpleVT().getScalarType() == MVT::i1) return false; // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSimpleVT().getSizeInBits() == 64) return false; // We only care that the types being shuffled are legal. The lowering can // handle any possible shuffle mask that results. return isTypeLegal(VT.getSimpleVT()); } bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef Mask, EVT VT) const { // Don't convert an 'and' into a shuffle that we don't directly support. // vpblendw and vpshufb for 256-bit vectors are not available on AVX1. if (!Subtarget.hasAVX2()) if (VT == MVT::v32i8 || VT == MVT::v16i16) return false; // Just delegate to the generic legality, clear masks aren't special. return isShuffleMaskLegal(Mask, VT); } bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { // If the subtarget is using retpolines, we need to not generate jump tables. if (Subtarget.useRetpolineIndirectBranches()) return false; // Otherwise, fallback on the generic logic. return TargetLowering::areJTsAllowed(Fn); } //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { DebugLoc DL = MI.getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // For the v = xbegin(), we generate // // thisMBB: // xbegin sinkMBB // // mainMBB: // s0 = -1 // // fallBB: // eax = # XABORT_DEF // s1 = eax // // sinkMBB: // v = phi(s0/mainBB, s1/fallBB) MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, fallMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned fallDstReg = MRI.createVirtualRegister(RC); // thisMBB: // xbegin fallMBB // # fallthrough to mainMBB // # abortion to fallMBB BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(fallMBB); // mainMBB: // mainDstReg := -1 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); mainMBB->addSuccessor(sinkMBB); // fallMBB: // ; pseudo instruction to model hardware's definition from XABORT // EAX := XABORT_DEF // fallDstReg := EAX BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) .addReg(X86::EAX); fallMBB->addSuccessor(sinkMBB); // sinkMBB: // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(fallDstReg).addMBB(fallMBB); MI.eraseFromParent(); return sinkMBB; } MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. // Operands to this pseudo-instruction: // 0 ) Output : destination address (reg) // 1-5) Input : va_list address (addr, i64mem) // 6 ) ArgSize : Size (in bytes) of vararg type // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset // 8 ) Align : Alignment of type // 9 ) EFLAGS (implicit-def) assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI.getOperand(0).getReg(); MachineOperand &Base = MI.getOperand(1); MachineOperand &Scale = MI.getOperand(2); MachineOperand &Index = MI.getOperand(3); MachineOperand &Disp = MI.getOperand(4); MachineOperand &Segment = MI.getOperand(5); unsigned ArgSize = MI.getOperand(6).getImm(); unsigned ArgMode = MI.getOperand(7).getImm(); unsigned Align = MI.getOperand(8).getImm(); MachineFunction *MF = MBB->getParent(); // Memory Reference assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); MachineMemOperand *OldMMO = MI.memoperands().front(); // Clone the MMO into two separate MMOs for loading and storing MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand( OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore); MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand( OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad); // Machine Information const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); DebugLoc DL = MI.getDebugLoc(); // struct va_list { // i32 gp_offset // i32 fp_offset // i64 overflow_area (address) // i64 reg_save_area (address) // } // sizeof(va_list) = 24 // alignment(va_list) = 8 unsigned TotalNumIntRegs = 6; unsigned TotalNumXMMRegs = 8; bool UseGPOffset = (ArgMode == 1); bool UseFPOffset = (ArgMode == 2); unsigned MaxOffset = TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; bool NeedsAlign = (Align > 8); MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *overflowMBB; MachineBasicBlock *offsetMBB; MachineBasicBlock *endMBB; unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB unsigned OffsetReg = 0; if (!UseGPOffset && !UseFPOffset) { // If we only pull from the overflow region, we don't create a branch. // We don't need to alter control flow. OffsetDestReg = 0; // unused OverflowDestReg = DestReg; offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { // First emit code to check if gp_offset (or fp_offset) is below the bound. // If so, pull the argument from reg_save_area. (branch to offsetMBB) // If not, pull from overflow_area. (branch to overflowMBB) // // thisMBB // | . // | . // offsetMBB overflowMBB // | . // | . // endMBB // Registers for the PHI in endMBB OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = ++MBB->getIterator(); // Insert the new basic blocks MF->insert(MBBIter, offsetMBB); MF->insert(MBBIter, overflowMBB); MF->insert(MBBIter, endMBB); // Transfer the remainder of MBB and its successor edges to endMBB. endMBB->splice(endMBB->begin(), thisMBB, std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Make offsetMBB and overflowMBB successors of thisMBB thisMBB->addSuccessor(offsetMBB); thisMBB->addSuccessor(overflowMBB); // endMBB is a successor of both offsetMBB and overflowMBB offsetMBB->addSuccessor(endMBB); overflowMBB->addSuccessor(endMBB); // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .setMemRefs(LoadOnlyMMO); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) .addReg(OffsetReg) .addImm(MaxOffset + 8 - ArgSizeA8); // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise BuildMI(thisMBB, DL, TII->get(X86::JCC_1)) .addMBB(overflowMBB).addImm(X86::COND_AE); } // In offsetMBB, emit code to use the reg_save_area. if (offsetMBB) { assert(OffsetReg != 0); // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 16) .add(Segment) .setMemRefs(LoadOnlyMMO); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); // Add the offset to the reg_save_area to get the final address. BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) .addReg(OffsetReg64) .addReg(RegSaveReg); // Compute the offset for the next argument unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .addReg(NextOffsetReg) .setMemRefs(StoreOnlyMMO); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } // // Emit code to use overflow area // // Load the overflow_area address into a register. unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 8) .add(Segment) .setMemRefs(LoadOnlyMMO); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. if (NeedsAlign) { // Align the overflow address assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) .addReg(OverflowAddrReg) .addImm(Align-1); BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Align-1)); } else { BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); } // Compute the next overflow address after this argument. // (the overflow address should be kept 8-byte aligned) unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 8) .add(Segment) .addReg(NextAddrReg) .setMemRefs(StoreOnlyMMO); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { BuildMI(*endMBB, endMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(OffsetDestReg).addMBB(offsetMBB) .addReg(OverflowDestReg).addMBB(overflowMBB); } // Erase the pseudo instruction MI.eraseFromParent(); return endMBB; } MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit code to save XMM registers to the stack. The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, // however this code takes a simpler approach and just executes all // of the stores if %al is non-zero. It's less code, and it's probably // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. // Create the new basic blocks. One block contains all the XMM stores, // and one block is the final destination regardless of whether any // stores were performed. const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *F = MBB->getParent(); MachineFunction::iterator MBBIter = ++MBB->getIterator(); MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, XMMSaveMBB); F->insert(MBBIter, EndMBB); // Transfer the remainder of MBB and its successor edges to EndMBB. EndMBB->splice(EndMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndMBB->transferSuccessorsAndUpdatePHIs(MBB); // The original block will now fall through to the XMM save block. MBB->addSuccessor(XMMSaveMBB); // The XMMSaveMBB will fall through to the end block. XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); unsigned CountReg = MI.getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E); MBB->addSuccessor(EndMBB); } // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". assert((MI.getNumOperands() <= 3 || !MI.getOperand(MI.getNumOperands() - 1).isReg() || MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) .addReg(MI.getOperand(i).getReg()) .addMemOperand(MMO); } MI.eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } // The EFLAGS operand of SelectItr might be missing a kill marker // because there were multiple uses of EFLAGS, and ISel didn't know // which to mark. Figure out whether SelectItr should have had a // kill marker, and set it if it should. Returns the correct kill // marker value. static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { // Scan forward through BB for a use/def of EFLAGS. MachineBasicBlock::iterator miI(std::next(SelectItr)); for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { const MachineInstr& mi = *miI; if (mi.readsRegister(X86::EFLAGS)) return false; if (mi.definesRegister(X86::EFLAGS)) break; // Should have kill-flag - update below. } // If we hit the end of the block, check whether EFLAGS is live into a // successor. if (miI == BB->end()) { for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), sEnd = BB->succ_end(); sItr != sEnd; ++sItr) { MachineBasicBlock* succ = *sItr; if (succ->isLiveIn(X86::EFLAGS)) return false; } } // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. SelectItr->addRegisterKilled(X86::EFLAGS, TRI); return true; } // Return true if it is OK for this CMOV pseudo-opcode to be cascaded // together with other CMOV pseudo-opcodes into a single basic-block with // conditional jump around it. static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: case X86::CMOV_VK16: case X86::CMOV_VK32: case X86::CMOV_VK64: return true; default: return false; } } // Helper function, which inserts PHI functions into SinkMBB: // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ], // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for // the last PHI function inserted. static MachineInstrBuilder createPHIsForCMOVsInSinkBB( MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB) { MachineFunction *MF = TrueMBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); DebugLoc DL = MIItBegin->getDebugLoc(); X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); // As we are creating the PHIs, we have to be careful if there is more than // one. Later CMOVs may reference the results of earlier CMOVs, but later // PHIs have to reference the individual true/false inputs from earlier PHIs. // That also means that PHI construction must work forward from earlier to // later, and that the code must maintain a mapping from earlier PHI's // destination registers, and the registers that went into the PHI. DenseMap> RegRewriteTable; MachineInstrBuilder MIB; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { unsigned DestReg = MIIt->getOperand(0).getReg(); unsigned Op1Reg = MIIt->getOperand(1).getReg(); unsigned Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the // PHI that is going to be generated. if (MIIt->getOperand(3).getImm() == OppCC) std::swap(Op1Reg, Op2Reg); if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) Op1Reg = RegRewriteTable[Op1Reg].first; if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) Op2Reg = RegRewriteTable[Op2Reg].second; MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(FalseMBB) .addReg(Op2Reg) .addMBB(TrueMBB); // Add this PHI to the rewrite table. RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); } return MIB; } // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2). MachineBasicBlock * X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, MachineInstr &SecondCascadedCMOV, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = FirstCMOV.getDebugLoc(); // We lower cascaded CMOVs such as // // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2) // // to two successive branches. // // Without this, we would add a PHI between the two jumps, which ends up // creating a few copies all around. For instance, for // // (sitofp (zext (fcmp une))) // // we would generate: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // movaps %xmm0, %xmm1 // jne .LBB5_2 // xorps %xmm1, %xmm1 // .LBB5_2: // jp .LBB5_4 // movaps %xmm1, %xmm0 // .LBB5_4: // retq // // because this custom-inserter would have generated: // // A // | \ // | B // | / // C // | \ // | D // | / // E // // A: X = ...; Y = ... // B: empty // C: Z = PHI [X, A], [Y, B] // D: empty // E: PHI [X, C], [Z, D] // // If we lower both CMOVs in a single step, we can instead generate: // // A // | \ // | C // | /| // |/ | // | | // | D // | / // E // // A: X = ...; Y = ... // D: empty // E: PHI [X, A], [X, C], [Y, D] // // Which, in our sitofp/fcmp example, gives us something like: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // jne .LBB5_4 // jp .LBB5_4 // xorps %xmm0, %xmm0 // .LBB5_4: // retq // // We lower cascaded CMOV into two successive branches to the same block. // EFLAGS is used by both, so mark it as live in the second. const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FirstInsertedMBB); F->insert(It, SecondInsertedMBB); F->insert(It, SinkMBB); // For a cascaded CMOV, we lower it to two successive branches to // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in // the FirstInsertedMBB. FirstInsertedMBB->addLiveIn(X86::EFLAGS); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) { SecondInsertedMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->begin(), ThisMBB, std::next(MachineBasicBlock::iterator(FirstCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FirstInsertedMBB); // The true block target of the first branch is always SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FirstInsertedMBB. FirstInsertedMBB->addSuccessor(SecondInsertedMBB); // The true block for the branch of FirstInsertedMBB. FirstInsertedMBB->addSuccessor(SinkMBB); // This is fallthrough. SecondInsertedMBB->addSuccessor(SinkMBB); // Create the conditional branch instructions. X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC); X86::CondCode SecondCC = X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC); // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] unsigned DestReg = FirstCMOV.getOperand(0).getReg(); unsigned Op1Reg = FirstCMOV.getOperand(1).getReg(); unsigned Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(SecondInsertedMBB) .addReg(Op2Reg) .addMBB(ThisMBB); // The second SecondInsertedMBB provides the same incoming value as the // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes). MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB); // Copy the PHI result to the register defined by the second CMOV. BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, TII->get(TargetOpcode::COPY), SecondCascadedCMOV.getOperand(0).getReg()) .addReg(FirstCMOV.getOperand(0).getReg()); // Now remove the CMOVs. FirstCMOV.eraseFromParent(); SecondCascadedCMOV.eraseFromParent(); return SinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between and a branch opcode to use. // ThisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> FalseMBB // This code lowers all pseudo-CMOV instructions. Generally it lowers these // as described above, by inserting a BB, and then making a PHI at the join // point to select the true and false operands of the CMOV in the PHI. // // The code also handles two different cases of multiple CMOV opcodes // in a row. // // Case 1: // In this case, there are multiple CMOVs in a row, all which are based on // the same condition setting (or the exact opposite condition setting). // In this case we can lower all the CMOVs using a single inserted BB, and // then make a number of PHIs at the join point to model the CMOVs. The only // trickiness here, is that in a case like: // // t2 = CMOV cond1 t1, f1 // t3 = CMOV cond1 t2, f2 // // when rewriting this into PHIs, we have to perform some renaming on the // temps since you cannot have a PHI operand refer to a PHI result earlier // in the same block. The "simple" but wrong lowering would be: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t2(BB1), f2(BB2) // // but clearly t2 is not defined in BB1, so that is incorrect. The proper // renaming is to note that on the path through BB1, t2 is really just a // copy of t1, and do that renaming, properly generating: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t1(BB1), f2(BB2) // // Case 2: // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate // function - EmitLoweredCascadedSelect. X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineInstr *LastCMOV = &MI; MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { // See if we have a string of CMOVS with the same condition. Skip over // intervening debug insts. while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; ++NextMIIt; NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end()); } } // This checks for case 2, but only do this if we didn't already find // case 1, as indicated by LastCMOV == MI. if (LastCMOV == &MI && NextMIIt != ThisMBB->end() && NextMIIt->getOpcode() == MI.getOpcode() && NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && NextMIIt->getOperand(1).isKill()) { return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB); } const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FalseMBB); F->insert(It, SinkMBB); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!LastCMOV->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) { FalseMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer any debug instructions inside the CMOV sequence to the sunk block. auto DbgEnd = MachineBasicBlock::iterator(LastCMOV); auto DbgIt = MachineBasicBlock::iterator(MI); while (DbgIt != DbgEnd) { auto Next = std::next(DbgIt); if (DbgIt->isDebugInstr()) SinkMBB->push_back(DbgIt->removeFromParent()); DbgIt = Next; } // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->end(), ThisMBB, std::next(MachineBasicBlock::iterator(LastCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FalseMBB); // The true block target of the first (or only) branch is always a SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FalseMBB. FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] // ... MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator MIItEnd = std::next(MachineBasicBlock::iterator(LastCMOV)); createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB); // Now remove the CMOV(s). ThisMBB->erase(MIItBegin, MIItEnd); return SinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); const bool Is64Bit = Subtarget.is64Bit(); const bool IsLP64 = Subtarget.isTarget64BitLP64(); const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] // If stacklet is not large enough, jump to mallocMBB // // bumpMBB: // Allocate by subtracting from RSP // Jump to continueMBB // // mallocMBB: // Allocate by call to runtime // // continueMBB: // ... // [rest of original BB] // MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI.getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = ++BB->getIterator(); MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); MF->insert(MBBIter, continueMBB); continueMBB->splice(continueMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); continueMBB->transferSuccessorsAndUpdatePHIs(BB); // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EDI, RegState::Implicit) .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); BB->addSuccessor(mallocMBB); mallocMBB->addSuccessor(continueMBB); bumpMBB->addSuccessor(continueMBB); // Take care of the PHI nodes. BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), MI.getOperand(0).getReg()) .addReg(mallocPtrVReg) .addMBB(mallocMBB) .addReg(bumpSPPtrVReg) .addMBB(bumpMBB); // Delete the original pseudo instruction. MI.eraseFromParent(); // And we're done. return continueMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); DebugLoc DL = MI.getDebugLoc(); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction().getPersonalityFn())) && "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. if (!Subtarget.is32Bit()) return BB; // C++ EH creates a new target block to hold the restore code, and wires up // the new block to the return destination with a normal JMP_4. MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); assert(BB->succ_size() == 1); MF->insert(std::next(BB->getIterator()), RestoreMBB); RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RestoreMBB); MI.getOperand(0).setMBB(RestoreMBB); auto RestoreMBBI = RestoreMBB->begin(); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const Constant *PerFn = MF->getFunction().getPersonalityFn(); bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); // Only 32-bit SEH requires special handling for catchpad. if (IsSEH && Subtarget.is32Bit()) { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); } MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const { // So, here we replace TLSADDR with the sequence: // adjust_stackdown -> TLSADDR -> adjust_stackup. // We need this because TLSADDR is lowered into calls // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue pass them. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. // We don't call erase from parent because we want to keep the // original instruction around. unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); MachineInstrBuilder CallseqEnd = BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const { // This is pretty easy. We're taking the value that we received from // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); assert(MI.getOperand(3).isGlobal() && "This should be a global"); // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = Subtarget.is64Bit() ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget.is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (!isPositionIndependent()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } static unsigned getOpcodeForRetpoline(unsigned RPOpc) { switch (RPOpc) { case X86::RETPOLINE_CALL32: return X86::CALLpcrel32; case X86::RETPOLINE_CALL64: return X86::CALL64pcrel32; case X86::RETPOLINE_TCRETURN32: return X86::TCRETURNdi; case X86::RETPOLINE_TCRETURN64: return X86::TCRETURNdi64; } llvm_unreachable("not retpoline opcode"); } static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, unsigned Reg) { if (Subtarget.useRetpolineExternalThunk()) { // When using an external thunk for retpolines, we pick names that match the // names GCC happens to use as well. This helps simplify the implementation // of the thunks for kernels where they have no easy ability to create // aliases and are doing non-trivial configuration of the thunk's body. For // example, the Linux kernel will do boot-time hot patching of the thunk // bodies and cannot easily export aliases of these to loaded modules. // // Note that at any point in the future, we may need to change the semantics // of how we implement retpolines and at that time will likely change the // name of the called thunk. Essentially, there is no hard guarantee that // LLVM will generate calls to specific thunks, we merely make a best-effort // attempt to help out kernels and other systems where duplicating the // thunks is costly. switch (Reg) { case X86::EAX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_eax"; case X86::ECX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_ecx"; case X86::EDX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_edx"; case X86::EDI: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_edi"; case X86::R11: assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__x86_indirect_thunk_r11"; } llvm_unreachable("unexpected reg for retpoline"); } // When targeting an internal COMDAT thunk use an LLVM-specific name. switch (Reg) { case X86::EAX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_eax"; case X86::ECX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_ecx"; case X86::EDX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_edx"; case X86::EDI: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_edi"; case X86::R11: assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__llvm_retpoline_r11"; } llvm_unreachable("unexpected reg for retpoline"); } MachineBasicBlock * X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, MachineBasicBlock *BB) const { // Copy the virtual register into the R11 physical register and // call the retpoline thunk. DebugLoc DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); unsigned CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); // Find an available scratch register to hold the callee. On 64-bit, we can // just use R11, but we scan for uses anyway to ensure we don't generate // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't // already a register use operand to the call to hold the callee. If none // are available, use EDI instead. EDI is chosen because EBX is the PIC base // register and ESI is the base pointer to realigned stack frames with VLAs. SmallVector AvailableRegs; if (Subtarget.is64Bit()) AvailableRegs.push_back(X86::R11); else AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); // Zero out any registers that are already used. for (const auto &MO : MI.operands()) { if (MO.isReg() && MO.isUse()) for (unsigned &Reg : AvailableRegs) if (Reg == MO.getReg()) Reg = 0; } // Choose the first remaining non-zero available register. unsigned AvailableReg = 0; for (unsigned MaybeReg : AvailableRegs) { if (MaybeReg) { AvailableReg = MaybeReg; break; } } if (!AvailableReg) report_fatal_error("calling convention incompatible with retpoline, no " "available registers"); const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) .addReg(CalleeVReg); MI.getOperand(0).ChangeToES(Symbol); MI.setDesc(TII->get(Opc)); MachineInstrBuilder(*BB->getParent(), &MI) .addReg(AvailableReg, RegState::Implicit | RegState::Kill); return BB; } /// SetJmp implies future control flow change upon calling the corresponding /// LongJmp. /// Instead of using the 'return' instruction, the long jump fixes the stack and /// performs an indirect branch. To do so it uses the registers that were stored /// in the jump buffer (when calling SetJmp). /// In case the shadow stack is enabled we need to fix it as well, because some /// return addresses will be skipped. /// The function will save the SSP for future fixing in the function /// emitLongJmpShadowStackFix. /// \sa emitLongJmpShadowStackFix /// \param [in] MI The temporary Machine Instruction for the builtin. /// \param [in] MBB The Machine Basic Block that will be modified. void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineInstrBuilder MIB; // Memory Reference. SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); // Initialize a register with zero. MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); unsigned ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(*MBB, MI, DL, TII->get(XorRROpc)) .addDef(ZReg) .addReg(ZReg, RegState::Undef) .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); // Write the SSP register value to offset 3 in input memory buffer. unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc)); const int64_t SSPOffset = 3 * PVT.getStoreSize(); const unsigned MemOpndSlot = 1; for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } MIB.addReg(SSPCopyReg); MIB.setMemRefs(MMOs); } MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); unsigned DstReg; unsigned MemOpndSlot = 0; unsigned CurOp = 0; DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB // SjLjSetup restoreMBB // // mainMBB: // v_main = 0 // // sinkMBB: // v = phi(main, restore) // // restoreMBB: // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); restoreMBB->setHasAddressTaken(); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); // Prepare IP either in reg or imm. if (!UseImmLabel) { PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget.is64Bit()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(restoreMBB) .addReg(0); } else { const X86InstrInfo *XII = static_cast(TII); MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } } else PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; // Store IP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); else MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOs); if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) { emitSetJmpShadowStackFix(MI, thisMBB); } // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); // mainMBB: // EAX = 0 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo(); X86FI->setRestoreBasePointer(MF); unsigned FramePtr = RegInfo->getFrameRegister(*MF); unsigned BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI.eraseFromParent(); return sinkMBB; } /// Fix the shadow stack using the previously saved SSP pointer. /// \sa emitSetJmpShadowStackFix /// \param [in] MI The temporary Machine Instruction for the builtin. /// \param [in] MBB The Machine Basic Block that will be modified. /// \return The sink MBB that will perform the future indirect branch. MachineBasicBlock * X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); // checkSspMBB: // xor vreg1, vreg1 // rdssp vreg1 // test vreg1, vreg1 // je sinkMBB # Jump if Shadow Stack is not supported // fallMBB: // mov buf+24/12(%rip), vreg2 // sub vreg1, vreg2 // jbe sinkMBB # No need to fix the Shadow Stack // fixShadowMBB: // shr 3/2, vreg2 // incssp vreg2 # fix the SSP according to the lower 8 bits // shr 8, vreg2 // je sinkMBB // fixShadowLoopPrepareMBB: // shl vreg2 // mov 128, vreg3 // fixShadowLoopMBB: // incssp vreg3 // dec vreg2 // jne fixShadowLoopMBB # Iterate until you finish fixing // # the Shadow Stack // sinkMBB: MachineFunction::iterator I = ++MBB->getIterator(); const BasicBlock *BB = MBB->getBasicBlock(); MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, checkSspMBB); MF->insert(I, fallMBB); MF->insert(I, fixShadowMBB); MF->insert(I, fixShadowLoopPrepareMBB); MF->insert(I, fixShadowLoopMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MBB->addSuccessor(checkSspMBB); // Initialize a register with zero. unsigned ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(checkSspMBB, DL, TII->get(XorRROpc)) .addDef(ZReg) .addReg(ZReg, RegState::Undef) .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); // Check whether the result of the SSP register is zero and jump directly // to the sink. unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr; BuildMI(checkSspMBB, DL, TII->get(TestRROpc)) .addReg(SSPCopyReg) .addReg(SSPCopyReg); BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); checkSspMBB->addSuccessor(sinkMBB); checkSspMBB->addSuccessor(fallMBB); // Reload the previously saved SSP register value. unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; const int64_t SPPOffset = 3 * PVT.getStoreSize(); MachineInstrBuilder MIB = BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (i == X86::AddrDisp) MIB.addDisp(MO, SPPOffset); else if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Subtract the current SSP from the previous SSP. unsigned SspSubReg = MRI.createVirtualRegister(PtrRC); unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr; BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg) .addReg(PrevSSPReg) .addReg(SSPCopyReg); // Jump to sink in case PrevSSPReg <= SSPCopyReg. BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE); fallMBB->addSuccessor(sinkMBB); fallMBB->addSuccessor(fixShadowMBB); // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8. unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri; unsigned Offset = (PVT == MVT::i64) ? 3 : 2; unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg) .addReg(SspSubReg) .addImm(Offset); // Increase SSP when looking only on the lower 8 bits of the delta. unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD; BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg); // Reset the lower 8 bits. unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg) .addReg(SspFirstShrReg) .addImm(8); // Jump if the result of the shift is zero. BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); fixShadowMBB->addSuccessor(sinkMBB); fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB); // Do a single shift left. unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1; unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg) .addReg(SspSecondShrReg); // Save the value 128 to a register (will be used next with incssp). unsigned Value128InReg = MRI.createVirtualRegister(PtrRC); unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri; BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg) .addImm(128); fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB); // Since incssp only looks at the lower 8 bits, we might need to do several // iterations of incssp until we finish fixing the shadow stack. unsigned DecReg = MRI.createVirtualRegister(PtrRC); unsigned CounterReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg) .addReg(SspAfterShlReg) .addMBB(fixShadowLoopPrepareMBB) .addReg(DecReg) .addMBB(fixShadowLoopMBB); // Every iteration we increase the SSP by 128. BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg); // Every iteration we decrement the counter by 1. unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r; BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg); // Jump if the counter is not zero yet. BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE); fixShadowLoopMBB->addSuccessor(sinkMBB); fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB); return sinkMBB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; MachineBasicBlock *thisMBB = MBB; // When CET and shadow stack is enabled, we need to fix the Shadow Stack. if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) { thisMBB = emitLongJmpShadowStackFix(MI, thisMBB); } // Reload FP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Reload IP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (i == X86::AddrDisp) MIB.addDisp(MO, LabelOffset); else if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Reload SP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); else MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's // the last instruction of the expansion. } MIB.setMemRefs(MMOs); // Jump BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); MI.eraseFromParent(); return thisMBB; } void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); unsigned Op = 0; unsigned VR = 0; bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); if (UseImmLabel) { Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; } else { const TargetRegisterClass *TRC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; VR = MRI->createVirtualRegister(TRC); Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; if (Subtarget.is64Bit()) BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) .addImm(1) .addReg(0) .addMBB(DispatchBB) .addReg(0); else BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36); if (UseImmLabel) MIB.addMBB(DispatchBB); else MIB.addReg(VR); } MachineBasicBlock * X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); int FI = MF->getFrameInfo().getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. DenseMap> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (auto &MBB : *MF) { if (!MBB.isEHPad()) continue; MCSymbol *Sym = nullptr; for (const auto &MI : MBB) { if (MI.isDebugInstr()) continue; assert(MI.isEHLabel() && "expected EH_LABEL"); Sym = MI.getOperand(0).getMCSymbol(); break; } if (!MF->hasCallSiteLandingPad(Sym)) continue; for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { CallSiteNumToLPad[CSI].push_back(&MBB); MaxCSNum = std::max(MaxCSNum, CSI); } } // Get an ordered list of the machine basic blocks for the jump table. std::vector LPadList; SmallPtrSet InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { for (auto &LP : CallSiteNumToLPad[CSI]) { LPadList.push_back(LP); InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); } } assert(!LPadList.empty() && "No landing pad destinations for the dispatch jump table!"); // Create the MBBs for the dispatch code. // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); DispatchBB->setIsEHPad(true); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); BuildMI(TrapBB, DL, TII->get(X86::TRAP)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(DispContBB); // Insert MBBs. MF->push_back(DispatchBB); MF->push_back(DispContBB); MF->push_back(TrapBB); // Insert code into the entry block that creates and registers the function // context. SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); // Create the jump table and associated information unsigned JTE = getJumpTableEncoding(); MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE); unsigned MJTI = JTI->createJumpTableIndex(LPadList); const X86RegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. if (RI.hasBasePointer(*MF)) { const bool FPIs64Bit = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *MFI = MF->getInfo(); MFI->setRestoreBasePointer(MF); unsigned FP = RI.getFrameRegister(*MF); unsigned BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) .addRegMask(RI.getNoPreservedMask()); } else { BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) .addRegMask(RI.getNoPreservedMask()); } // IReg is used as an index in a memory operand and therefore can't be SP unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ? 8 : 4); BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); if (Subtarget.is64Bit()) { unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) .addReg(X86::RIP) .addImm(1) .addReg(0) .addJumpTableIndex(MJTI) .addReg(0); // movzx IReg64, IReg BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) .addImm(0) .addReg(IReg) .addImm(X86::sub_32bit); switch (JTE) { case MachineJumpTableInfo::EK_BlockAddress: // jmpq *(BReg,IReg64,8) BuildMI(DispContBB, DL, TII->get(X86::JMP64m)) .addReg(BReg) .addImm(8) .addReg(IReg64) .addImm(0) .addReg(0); break; case MachineJumpTableInfo::EK_LabelDifference32: { unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass); unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) .addReg(BReg) .addImm(4) .addReg(IReg64) .addImm(0) .addReg(0); // movsx OReg64, OReg BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg); // addq BReg, OReg64, TReg BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg) .addReg(OReg64) .addReg(BReg); // jmpq *TReg BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg); break; } default: llvm_unreachable("Unexpected jump table encoding"); } } else { // jmpl *.LJTI0_0(,IReg,4) BuildMI(DispContBB, DL, TII->get(X86::JMP32m)) .addReg(0) .addImm(4) .addReg(IReg) .addJumpTableIndex(MJTI) .addReg(0); } // Add the jump table entries as successors to the MBB. SmallPtrSet SeenMBBs; for (auto &LP : LPadList) if (SeenMBBs.insert(LP).second) DispContBB->addSuccessor(LP); // N.B. the order the invoke BBs are processed in doesn't matter here. SmallVector MBBLPads; const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. // Keep a copy of Successors since it's modified inside the loop. SmallVector Successors(MBB->succ_rbegin(), MBB->succ_rend()); // FIXME: Avoid quadratic complexity. for (auto MBBS : Successors) { if (MBBS->isEHPad()) { MBB->removeSuccessor(MBBS); MBBLPads.push_back(MBBS); } } MBB->addSuccessor(DispatchBB); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from // moving instructions to before the EH block, where they will never be // executed. for (auto &II : reverse(*MBB)) { if (!II.isCall()) continue; DenseMap DefRegs; for (auto &MOp : II.operands()) if (MOp.isReg()) DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); for (unsigned RI = 0; SavedRegs[RI]; ++RI) { unsigned Reg = SavedRegs[RI]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } break; } } // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. for (auto &LP : MBBLPads) LP->setIsEHPad(false); // The instruction is gone now. MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: case X86::TLS_addr64: case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); case X86::RETPOLINE_CALL32: case X86::RETPOLINE_CALL64: case X86::RETPOLINE_TCRETURN32: case X86::RETPOLINE_TCRETURN64: return EmitLoweredRetpoline(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: case X86::CMOV_FR32X: case X86::CMOV_FR64: case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: case X86::CMOV_VK16: case X86::CMOV_VK32: case X86::CMOV_VK64: return EmitLoweredSelect(MI, BB); case X86::RDFLAGS32: case X86::RDFLAGS64: { unsigned PushF = MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); // Permit reads of the EFLAGS and DF registers without them being defined. // This intrinsic exists to read external processor state in flags, such as // the trap flag, interrupt flag, and direction flag, none of which are // modeled by the backend. assert(Push->getOperand(2).getReg() == X86::EFLAGS && "Unexpected register in operand!"); Push->getOperand(2).setIsUndef(); assert(Push->getOperand(3).getReg() == X86::DF && "Unexpected register in operand!"); Push->getOperand(3).setIsUndef(); BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::WRFLAGS32: case X86::WRFLAGS64: { unsigned Push = MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; unsigned PopF = MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg()); BuildMI(*BB, MI, DL, TII->get(PopF)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: case X86::FP64_TO_INT16_IN_MEM: case X86::FP64_TO_INT32_IN_MEM: case X86::FP64_TO_INT64_IN_MEM: case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); // Load the old value of the control word... unsigned OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), OrigCWFrameIdx); // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. unsigned NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) .addReg(OldCW, RegState::Kill).addImm(0xC00); // Extract to 16 bits. unsigned NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) .addReg(NewCW, RegState::Kill, X86::sub_16bit); // Prepare memory for FLDCW. int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), NewCWFrameIdx) .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), NewCWFrameIdx); // Get the X86 opcode to use. unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; } X86AddressMode AM = getAddressFromInstr(&MI, 0); addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), OrigCWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } // xbegin case X86::XBEGIN: return emitXBegin(MI, BB, Subtarget.getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: return emitEHSjLjSetJmp(MI, BB); case X86::EH_SjLj_LongJmp32: case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); case X86::Int_eh_sjlj_setup_dispatch: return EmitSjLjDispatchBlock(MI, BB); case TargetOpcode::STATEPOINT: // As an implementation detail, STATEPOINT shares the STACKMAP format at // this point in the process. We diverge later. return emitPatchPoint(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case TargetOpcode::PATCHABLE_EVENT_CALL: return emitXRayCustomEvent(MI, BB); case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: return emitXRayTypedEvent(MI, BB); case X86::LCMPXCHG8B: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B // requires a memory operand. If it happens that current architecture is // i686 and for current function we need a base pointer // - which is ESI for i686 - register allocator would not be able to // allocate registers for an address in form of X(%reg, %reg, Y) // - there never would be enough unreserved registers during regalloc // (without the need for base ptr the only option would be X(%edi, %esi, Y). // We are giving a hand to register allocator by precomputing the address in // a new vreg using LEA. // If it is not i686 or there is no base pointer - nothing to do here. if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF)) return BB; // Even though this code does not necessarily needs the base pointer to // be ESI, we check for that. The reason: if this assert fails, there are // some changes happened in the compiler base pointer handling, which most // probably have to be addressed somehow here. assert(TRI->getBaseRegister() == X86::ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a " "base pointer in mind"); MachineRegisterInfo &MRI = MF->getRegInfo(); MVT SPTy = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); X86AddressMode AM = getAddressFromInstr(&MI, 0); // Regalloc does not need any help when the memory operand of CMPXCHG8B // does not use index register. if (AM.IndexReg == X86::NoRegister) return BB; // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its // four operand definitions that are E[ABCD] registers. We skip them and // then insert the LEA. MachineBasicBlock::iterator MBBI(MI); while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) || MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX)) --MBBI; addFullAddress( BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); setDirectAddressInInstr(&MI, 0, computedAddrVReg); return BB; } case X86::LCMPXCHG16B: return BB; case X86::LCMPXCHG8B_SAVE_EBX: case X86::LCMPXCHG16B_SAVE_RBX: { unsigned BasePtr = MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; if (!BB->isLiveIn(BasePtr)) BB->addLiveIn(BasePtr); return BB; } } } //===----------------------------------------------------------------------===// // X86 Optimization Hooks //===----------------------------------------------------------------------===// bool X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { // Only optimize Ands to prevent shrinking a constant that could be // matched by movzx. if (Op.getOpcode() != ISD::AND) return false; EVT VT = Op.getValueType(); // Ignore vectors. if (VT.isVector()) return false; unsigned Size = VT.getSizeInBits(); // Make sure the RHS really is a constant. ConstantSDNode *C = dyn_cast(Op.getOperand(1)); if (!C) return false; const APInt &Mask = C->getAPIntValue(); // Clear all non-demanded bits initially. APInt ShrunkMask = Mask & Demanded; // Find the width of the shrunk mask. unsigned Width = ShrunkMask.getActiveBits(); // If the mask is all 0s there's nothing to do here. if (Width == 0) return false; // Find the next power of 2 width, rounding up to a byte. Width = PowerOf2Ceil(std::max(Width, 8U)); // Truncate the width to size to handle illegal types. Width = std::min(Width, Size); // Calculate a possible zero extend mask for this constant. APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width); // If we aren't changing the mask, just return true to keep it and prevent // the caller from optimizing. if (ZeroExtendMask == Mask) return true; // Make sure the new mask can be represented by a combination of mask bits // and non-demanded bits. if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded)) return false; // Replace the constant with the zero extend mask. SDLoc DL(Op); SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT); SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); return TLO.CombineTo(Op, NewOp); } void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = Known.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); Known.resetAll(); switch (Opc) { default: break; case X86ISD::SETCC: Known.Zero.setBitsFrom(1); break; case X86ISD::MOVMSK: { unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); Known.Zero.setBitsFrom(NumLoBits); break; } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); Known = Known.zextOrTrunc(BitWidth, false); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } case X86ISD::VSRAI: case X86ISD::VSHLI: case X86ISD::VSRLI: { if (auto *ShiftImm = dyn_cast(Op.getOperand(1))) { if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { Known.setAllZero(); break; } Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); unsigned ShAmt = ShiftImm->getZExtValue(); if (Opc == X86ISD::VSHLI) { Known.Zero <<= ShAmt; Known.One <<= ShAmt; // Low bits are known zero. Known.Zero.setLowBits(ShAmt); } else if (Opc == X86ISD::VSRLI) { Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // High bits are known zero. Known.Zero.setHighBits(ShAmt); } else { Known.Zero.ashrInPlace(ShAmt); Known.One.ashrInPlace(ShAmt); } } break; } case X86ISD::PACKUS: { // PACKUS is just a truncation if the upper half is zero. APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); Known.One = APInt::getAllOnesValue(BitWidth * 2); Known.Zero = APInt::getAllOnesValue(BitWidth * 2); KnownBits Known2; if (!!DemandedLHS) { Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } if (!!DemandedRHS) { Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } if (Known.countMinLeadingZeros() < BitWidth) Known.resetAll(); Known = Known.trunc(BitWidth); break; } case X86ISD::ANDNP: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); // ANDNP = (~X & Y); Known.One &= Known2.Zero; Known.Zero |= Known2.One; break; } case X86ISD::FOR: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); // Output known-0 bits are only known if clear in both the LHS & RHS. Known.Zero &= Known2.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. Known.One |= Known2.One; break; } case X86ISD::CMOV: { Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1); // If we don't know any bits, early out. if (Known.isUnknown()) break; KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; Known.Zero &= Known2.Zero; break; } } // Handle target shuffles. // TODO - use resolveTargetShuffleInputs once we can limit recursive depth. if (isTargetShuffle(Opc)) { bool IsUnary; SmallVector Mask; SmallVector Ops; if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask, IsUnary)) { unsigned NumOps = Ops.size(); unsigned NumElts = VT.getVectorNumElements(); if (Mask.size() == NumElts) { SmallVector DemandedOps(NumOps, APInt(NumElts, 0)); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0; i != NumElts; ++i) { if (!DemandedElts[i]) continue; int M = Mask[i]; if (M == SM_SentinelUndef) { // For UNDEF elements, we don't know anything about the common state // of the shuffle result. Known.resetAll(); break; } else if (M == SM_SentinelZero) { Known.One.clearAllBits(); continue; } assert(0 <= M && (unsigned)M < (NumOps * NumElts) && "Shuffle index out of range"); unsigned OpIdx = (unsigned)M / NumElts; unsigned EltIdx = (unsigned)M % NumElts; if (Ops[OpIdx].getValueType() != VT) { // TODO - handle target shuffle ops with different value types. Known.resetAll(); break; } DemandedOps[OpIdx].setBit(EltIdx); } // Known bits are the values that are shared by every demanded element. for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) { if (!DemandedOps[i]) continue; KnownBits Known2 = DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } } } } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { EVT VT = Op.getValueType(); unsigned VTBits = VT.getScalarSizeInBits(); unsigned Opcode = Op.getOpcode(); switch (Opcode) { case X86ISD::SETCC_CARRY: // SETCC_CARRY sets the dest to ~0 for true or 0 for false. return VTBits; case X86ISD::VTRUNC: { // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); unsigned NumSrcBits = Src.getScalarValueSizeInBits(); assert(VTBits < NumSrcBits && "Illegal truncation input type"); unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); if (Tmp > (NumSrcBits - VTBits)) return Tmp - (NumSrcBits - VTBits); return 1; } case X86ISD::PACKSS: { // PACKSS is just a truncation if the sign bits extend to the packed size. APInt DemandedLHS, DemandedRHS; getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS, DemandedRHS); unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits(); unsigned Tmp0 = SrcBits, Tmp1 = SrcBits; if (!!DemandedLHS) Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); if (!!DemandedRHS) Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); unsigned Tmp = std::min(Tmp0, Tmp1); if (Tmp > (SrcBits - VTBits)) return Tmp - (SrcBits - VTBits); return 1; } case X86ISD::VSHLI: { SDValue Src = Op.getOperand(0); const APInt &ShiftVal = Op.getConstantOperandAPInt(1); if (ShiftVal.uge(VTBits)) return VTBits; // Shifted all bits out --> zero. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); if (ShiftVal.uge(Tmp)) return 1; // Shifted all sign bits out --> unknown. return Tmp - ShiftVal.getZExtValue(); } case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); APInt ShiftVal = Op.getConstantOperandAPInt(1); if (ShiftVal.uge(VTBits - 1)) return VTBits; // Sign splat. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); ShiftVal += Tmp; return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); } case X86ISD::PCMPGT: case X86ISD::PCMPEQ: case X86ISD::CMPP: case X86ISD::VPCOM: case X86ISD::VPCOMU: // Vector compares return zero/all-bits result values. return VTBits; case X86ISD::ANDNP: { unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); if (Tmp0 == 1) return 1; // Early out. unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1); return std::min(Tmp0, Tmp1); } case X86ISD::CMOV: { unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp0 == 1) return 1; // Early out. unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1); return std::min(Tmp0, Tmp1); } } // Handle target shuffles. // TODO - use resolveTargetShuffleInputs once we can limit recursive depth. if (isTargetShuffle(Opcode)) { bool IsUnary; SmallVector Mask; SmallVector Ops; if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask, IsUnary)) { unsigned NumOps = Ops.size(); unsigned NumElts = VT.getVectorNumElements(); if (Mask.size() == NumElts) { SmallVector DemandedOps(NumOps, APInt(NumElts, 0)); for (unsigned i = 0; i != NumElts; ++i) { if (!DemandedElts[i]) continue; int M = Mask[i]; if (M == SM_SentinelUndef) { // For UNDEF elements, we don't know anything about the common state // of the shuffle result. return 1; } else if (M == SM_SentinelZero) { // Zero = all sign bits. continue; } assert(0 <= M && (unsigned)M < (NumOps * NumElts) && "Shuffle index out of range"); unsigned OpIdx = (unsigned)M / NumElts; unsigned EltIdx = (unsigned)M % NumElts; if (Ops[OpIdx].getValueType() != VT) { // TODO - handle target shuffle ops with different value types. return 1; } DemandedOps[OpIdx].setBit(EltIdx); } unsigned Tmp0 = VTBits; for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) { if (!DemandedOps[i]) continue; unsigned Tmp1 = DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1); Tmp0 = std::min(Tmp0, Tmp1); } return Tmp0; } } } // Fallback case. return 1; } SDValue X86TargetLowering::unwrapAddress(SDValue N) const { if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP) return N->getOperand(0); return N; } // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); // Match against a VZEXT_MOVL vXi32 zero-extending instruction. if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) && isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) { Shuffle = X86ISD::VZEXT_MOVL; SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction. // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { bool MatchAny = true; bool MatchZero = true; unsigned NumDstElts = NumMaskElts / Scale; for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) { if (!isUndefOrEqual(Mask[i * Scale], (int)i)) { MatchAny = MatchZero = false; break; } MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1); MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } if (MatchAny || MatchZero) { assert(MatchZero && "Failed to match zext but matched aext?"); unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : MVT::getIntegerVT(MaskEltSize); SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND); if (SrcVT.getVectorNumElements() != NumDstElts) Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); return true; } } } // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && isUndefOrEqual(Mask[0], 0) && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { Shuffle = X86ISD::VZEXT_MOVL; SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } // Check if we have SSE3 which will let us use MOVDDUP etc. The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { if (isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v4f32; return true; } } if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v4f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v8f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v8f32; return true; } } if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v16f32; return true; } } return false; } // Attempt to match a combined shuffle mask against supported unary immediate // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); bool ContainsZeros = llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. if (!ContainsZeros && MaskScalarSizeInBits == 64) { // Check for lane crossing permutes. if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); PermuteImm = getV4X86ShuffleImm(Mask); return true; } if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { SmallVector RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); PermuteImm = getV4X86ShuffleImm(RepeatedMask); return true; } } } else if (AllowFloatDomain && Subtarget.hasAVX()) { // VPERMILPD can permute with a non-repeating shuffle. Shuffle = X86ISD::VPERMILPI; ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); PermuteImm = 0; for (int i = 0, e = Mask.size(); i != e; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); PermuteImm |= (M & 1) << i; } return true; } } // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { // Narrow the repeated mask to create 32-bit element permutes. SmallVector WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) scaleShuffleMask(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); PermuteImm = getV4X86ShuffleImm(WordMask); return true; } } // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef LoMask(Mask.data() + 0, 4); ArrayRef HiMask(Mask.data() + 4, 4); // PSHUFLW: permute lower 4 elements only. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { Shuffle = X86ISD::PSHUFLW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(LoMask); return true; } // PSHUFHW: permute upper 4 elements only. if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { // Offset the HiMask so that we can create the shuffle immediate. int OffsetHiMask[4]; for (int i = 0; i != 4; ++i) OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); Shuffle = X86ISD::PSHUFHW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } } } // Attempt to match against byte/bit shifts. // FIXME: Add 512-bit support. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; return true; } } return false; } // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchBinaryShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1); Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; SrcVT = DstVT = MVT::v4f32; return true; } } // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle. if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) || ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) || ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) { if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, Subtarget)) { DstVT = MaskVT; return true; } } // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle. if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || (MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, Subtarget)) { SrcVT = DstVT = MaskVT; if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); return true; } } return false; } static bool matchBinaryPermuteShuffle( MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); PermuteImm = ByteRotation; return true; } } // Attempt to combine to X86ISD::BLENDI. if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector TargetMask(Mask.begin(), Mask.end()); if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, RepeatedMask)) { assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); PermuteImm = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) PermuteImm |= 1 << i; V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::BLENDI; ShuffleVT = MaskVT; return true; } } else { V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; ShuffleVT = MaskVT; return true; } } } // Attempt to combine to INSERTPS. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { if (Zeroable.getBoolValue() && matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; } } // Attempt to combine to SHUFPD. if (AllowFloatDomain && EltSizeInBits == 64 && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; } } // Attempt to combine to SHUFPS. if (AllowFloatDomain && EltSizeInBits == 32 && ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { // Match each half of the repeated mask, to determine if its just // referencing one of the vectors, is zeroable or entirely undef. auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { int M0 = RepeatedMask[Offset]; int M1 = RepeatedMask[Offset + 1]; if (isUndefInRange(RepeatedMask, Offset, 2)) { return DAG.getUNDEF(MaskVT); } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) { S0 = (SM_SentinelUndef == M0 ? -1 : 0); S1 = (SM_SentinelUndef == M1 ? -1 : 1); return getZeroVector(MaskVT, Subtarget, DAG, DL); } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V1; } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V2; } return SDValue(); }; int ShufMask[4] = {-1, -1, -1, -1}; SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]); SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]); if (Lo && Hi) { V1 = Lo; V2 = Hi; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); PermuteImm = getV4X86ShuffleImm(ShufMask); return true; } } } return false; } static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget); /// Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// /// This is the leaf of the recursive combine below. When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); assert((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"); // Find the inputs that enter the chain. Note that multiple uses are OK // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); SDValue V1 = peekThroughBitcasts(Inputs[0]); SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) : peekThroughBitcasts(Inputs[1])); MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); MVT RootVT = Root.getSimpleValueType(); assert(VT1.getSizeInBits() == RootVT.getSizeInBits() && VT2.getSizeInBits() == RootVT.getSizeInBits() && "Vector size mismatch"); SDLoc DL(Root); SDValue Res; unsigned NumBaseMaskElts = BaseMask.size(); if (NumBaseMaskElts == 1) { assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); return DAG.getBitcast(RootVT, V1); } unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || (RootVT.isFloatingPoint() && Depth >= 2) || (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size // is different from the root element size - this would prevent writemasks // from being reused. // TODO - this currently prevents all lane shuffles from occurring. // TODO - check for writemasks usage instead of always preventing combining. // TODO - attempt to narrow Mask back to writemask size. bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); // Attempt to match a subvector broadcast. // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0) if (UnaryShuffle && (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) { SmallVector BroadcastMask(NumBaseMaskElts, 0); if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) { SDValue Src = Inputs[0]; if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && Src.getOperand(0).isUndef() && Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits && MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) { return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL, Src.getValueType(), Src.getOperand(1))); } } } // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless // we need to use the zeroing feature. // TODO - this should support binary shuffles. if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, DAG.getUNDEF(ShuffleVT), DAG.getConstant(PermMask, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } // For masks that have been widened to 128-bit elements or more, // narrow back down to 64-bit elements. SmallVector Mask; if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; scaleShuffleMask(MaskScale, BaseMask, Mask); } else { Mask = SmallVector(BaseMask.begin(), BaseMask.end()); } unsigned NumMaskElts = Mask.size(); unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; // Determine the effective mask value type. FloatDomain &= (32 <= MaskEltSizeInBits); MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) : MVT::getIntegerVT(MaskEltSizeInBits); MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); // Only allow legal mask types. if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) return SDValue(); // Attempt to match the mask against known shuffle patterns. MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. // TODO: Should we indicate which domain is preferred if both are allowed? bool AllowFloatDomain = FloatDomain || (Depth > 3); bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. APInt Zeroable(NumMaskElts, 0); for (unsigned i = 0; i != NumMaskElts; ++i) if (isUndefOrZero(Mask[i])) Zeroable.setBit(i); if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load // directly if we don't shuffle the lower element and we shuffle the upper // (zero) elements within themselves. if (V1.getOpcode() == X86ISD::VZEXT_LOAD && (cast(V1)->getMemoryVT().getScalarSizeInBits() % MaskEltSizeInBits) == 0) { unsigned Scale = cast(V1)->getMemoryVT().getScalarSizeInBits() / MaskEltSizeInBits; ArrayRef HiMask(Mask.data() + Scale, NumMaskElts - Scale); if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { return DAG.getBitcast(RootVT, V1); } } // Attempt to match against broadcast-from-vector. // Limit AVX1 to cases where we're loading+broadcasting a scalar element. if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) { SmallVector BroadcastMask(NumMaskElts, 0); if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { if (V1.getValueType() == MaskVT && V1.getOpcode() == ISD::SCALAR_TO_VECTOR && MayFoldLoad(V1.getOperand(0))) { if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! Res = V1.getOperand(0); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); return DAG.getBitcast(RootVT, Res); } if (Subtarget.hasAVX2()) { if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! Res = DAG.getBitcast(MaskVT, V1); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); return DAG.getBitcast(RootVT, Res); } } } SDValue NewV1 = V1; // Save operand in case early exit happens. if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleSrcVT, NewV1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); return DAG.getBitcast(RootVT, Res); } if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, DAG.getConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } SDValue NewV1 = V1; // Save operands in case early exit happens. SDValue NewV2 = V2; if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2); return DAG.getBitcast(RootVT, Res); } NewV1 = V1; // Save operands in case early exit happens. NewV2 = V2; if (matchBinaryPermuteShuffle( MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleVT, NewV1); NewV2 = DAG.getBitcast(ShuffleVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, DAG.getConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } // Typically from here on, we need an integer version of MaskVT. MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); // Annoyingly, SSE4A instructions don't map into the above match helpers. if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); V2 = DAG.getBitcast(IntMaskVT, V2); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX2() && (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); Res = DAG.getBitcast(MaskVT, V1); Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); return DAG.getBitcast(RootVT, Res); } // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero // vector as the second source. if (UnaryShuffle && AllowVariableMask && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasVLX() && (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { // Adjust shuffle mask - replace SM_SentinelZero with second source index. for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); Res = DAG.getBitcast(MaskVT, V1); SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero); return DAG.getBitcast(RootVT, Res); } // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input lane-crossing shuffle then lower to VPERMV3. if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasVLX() && (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); V1 = DAG.getBitcast(MaskVT, V1); V2 = DAG.getBitcast(MaskVT, V2); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); return DAG.getBitcast(RootVT, Res); } return SDValue(); } // See if we can combine a single input shuffle with zeros to a bit-mask, // which is much simpler than any shuffle. if (UnaryShuffle && MaskContainsZeros && AllowVariableMask && isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { APInt Zero = APInt::getNullValue(MaskEltSizeInBits); APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits); APInt UndefElts(NumMaskElts, 0); SmallVector EltBits(NumMaskElts, Zero); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { UndefElts.setBit(i); continue; } if (M == SM_SentinelZero) continue; EltBits[i] = AllOnes; } SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); Res = DAG.getBitcast(MaskVT, V1); unsigned AndOpcode = FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); return DAG.getBitcast(RootVT, Res); } // If we have a single input shuffle with different shuffle patterns in the // the 128-bit lanes use the variable mask to VPERMILPS. // TODO Combine other mask types at higher depths. if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { SmallVector VPermIdx; for (int M : Mask) { SDValue Idx = M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); VPermIdx.push_back(Idx); } SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); Res = DAG.getBitcast(MaskVT, V1); Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); return DAG.getBitcast(RootVT, Res); } // With XOP, binary shuffles of 128/256-bit floating point vectors can combine // to VPERMIL2PD/VPERMIL2PS. if (AllowVariableMask && Subtarget.hasXOP() && (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v8f32)) { // VPERMIL2 Operation. // Bits[3] - Match Bit. // Bits[2:1] - (Per Lane) PD Shuffle Mask. // Bits[2:0] - (Per Lane) PS Shuffle Mask. unsigned NumLanes = MaskVT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumMaskElts / NumLanes; SmallVector VPerm2Idx; unsigned M2ZImm = 0; for (int M : Mask) { if (M == SM_SentinelUndef) { VPerm2Idx.push_back(-1); continue; } if (M == SM_SentinelZero) { M2ZImm = 2; VPerm2Idx.push_back(8); continue; } int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); VPerm2Idx.push_back(Index); } V1 = DAG.getBitcast(MaskVT, V1); V2 = DAG.getBitcast(MaskVT, V2); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getConstant(M2ZImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. // Intel's manuals suggest only using PSHUFB if doing so replacing 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. if (UnaryShuffle && AllowVariableMask && ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || (RootVT.is256BitVector() && Subtarget.hasAVX2()) || (RootVT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector PSHUFBMask; int NumBytes = RootVT.getSizeInBits() / 8; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; assert((M / 16) == (i / 16) && "Lane crossing detected"); PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); Res = DAG.getBitcast(ByteVT, V1); SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); return DAG.getBitcast(RootVT, Res); } // With XOP, if we have a 128-bit binary input shuffle we can always combine // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never // slower than PSHUFB on targets that support both. if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) { // VPPERM Mask Operation // Bits[4:0] - Byte Index (0 - 31) // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) SmallVector VPPERMMask; int NumBytes = 16; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { VPPERMMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::v16i8; V1 = DAG.getBitcast(ByteVT, V1); V2 = DAG.getBitcast(ByteVT, V2); SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); return DAG.getBitcast(RootVT, Res); } // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input shuffle then lower to VPERMV3. if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasVLX() && (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); V1 = DAG.getBitcast(MaskVT, V1); V2 = DAG.getBitcast(MaskVT, V2); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); return DAG.getBitcast(RootVT, Res); } // Failed to find any combines. return SDValue(); } // Combine an arbitrary chain of shuffles + extract_subvectors into a single // instruction if possible. // // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger // type size to attempt to combine: // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1) // --> // extract_subvector(shuffle(x,y,m2),0) static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumMaskElts = BaseMask.size(); unsigned NumInputs = Inputs.size(); if (NumInputs == 0) return SDValue(); SmallVector WideInputs(Inputs.begin(), Inputs.end()); SmallVector Offsets(NumInputs, 0); // Peek through subvectors. // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs? unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits(); for (unsigned i = 0; i != NumInputs; ++i) { SDValue &Src = WideInputs[i]; unsigned &Offset = Offsets[i]; Src = peekThroughBitcasts(Src); EVT BaseVT = Src.getValueType(); while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR && isa(Src.getOperand(1))) { Offset += Src.getConstantOperandVal(1); Src = Src.getOperand(0); } WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits()); assert((Offset % BaseVT.getVectorNumElements()) == 0 && "Unexpected subvector extraction"); Offset /= BaseVT.getVectorNumElements(); Offset *= NumMaskElts; } // Bail if we're always extracting from the lowest subvectors, // combineX86ShuffleChain should match this for the current width. if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; })) return SDValue(); EVT RootVT = Root.getValueType(); unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned Scale = WideSizeInBits / RootSizeInBits; assert((WideSizeInBits % RootSizeInBits) == 0 && "Unexpected subvector extraction"); // If the src vector types aren't the same, see if we can extend // them to match each other. // TODO: Support different scalar types? EVT WideSVT = WideInputs[0].getValueType().getScalarType(); if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) { return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) || Op.getValueType().getScalarType() != WideSVT; })) return SDValue(); for (SDValue &NewInput : WideInputs) { assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 && "Shuffle vector size mismatch"); if (WideSizeInBits > NewInput.getValueSizeInBits()) NewInput = widenSubVector(NewInput, false, Subtarget, DAG, SDLoc(NewInput), WideSizeInBits); assert(WideSizeInBits == NewInput.getValueSizeInBits() && "Unexpected subvector extraction"); } // Create new mask for larger type. for (unsigned i = 1; i != NumInputs; ++i) Offsets[i] += i * Scale * NumMaskElts; SmallVector WideMask(BaseMask.begin(), BaseMask.end()); for (int &M : WideMask) { if (M < 0) continue; M = (M % NumMaskElts) + Offsets[M / NumMaskElts]; } WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef); // Remove unused/repeated shuffle source ops. resolveTargetShuffleInputsAndMask(WideInputs, WideMask); assert(!WideInputs.empty() && "Shuffle with no inputs detected"); if (WideInputs.size() > 2) return SDValue(); // Increase depth for every upper subvector we've peeked through. Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; }); // Attempt to combine wider chain. // TODO: Can we use a better Root? SDValue WideRoot = WideInputs[0]; if (SDValue WideShuffle = combineX86ShuffleChain( WideInputs, WideRoot, WideMask, Depth, HasVariableMask, AllowVariableMask, DAG, Subtarget)) { WideShuffle = extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits); return DAG.getBitcast(RootVT, WideShuffle); } return SDValue(); } // Attempt to constant fold all of the constant source ops. // Returns true if the entire shuffle is folded to a constant. // TODO: Extend this to merge multiple constant Ops and update the mask. static SDValue combineX86ShufflesConstants(ArrayRef Ops, ArrayRef Mask, SDValue Root, bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Root.getSimpleValueType(); unsigned SizeInBits = VT.getSizeInBits(); unsigned NumMaskElts = Mask.size(); unsigned MaskSizeInBits = SizeInBits / NumMaskElts; unsigned NumOps = Ops.size(); // Extract constant bits from each source op. bool OneUseConstantOp = false; SmallVector UndefEltsOps(NumOps); SmallVector, 16> RawBitsOps(NumOps); for (unsigned i = 0; i != NumOps; ++i) { SDValue SrcOp = Ops[i]; OneUseConstantOp |= SrcOp.hasOneUse(); if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i], RawBitsOps[i])) return SDValue(); } // Only fold if at least one of the constants is only used once or // the combined shuffle has included a variable mask shuffle, this // is to avoid constant pool bloat. if (!OneUseConstantOp && !HasVariableMask) return SDValue(); // Shuffle the constant bits according to the mask. APInt UndefElts(NumMaskElts, 0); APInt ZeroElts(NumMaskElts, 0); APInt ConstantElts(NumMaskElts, 0); SmallVector ConstantBitData(NumMaskElts, APInt::getNullValue(MaskSizeInBits)); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { UndefElts.setBit(i); continue; } else if (M == SM_SentinelZero) { ZeroElts.setBit(i); continue; } assert(0 <= M && M < (int)(NumMaskElts * NumOps)); unsigned SrcOpIdx = (unsigned)M / NumMaskElts; unsigned SrcMaskIdx = (unsigned)M % NumMaskElts; auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; if (SrcUndefElts[SrcMaskIdx]) { UndefElts.setBit(i); continue; } auto &SrcEltBits = RawBitsOps[SrcOpIdx]; APInt &Bits = SrcEltBits[SrcMaskIdx]; if (!Bits) { ZeroElts.setBit(i); continue; } ConstantElts.setBit(i); ConstantBitData[i] = Bits; } assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()); // Create the constant data. MVT MaskSVT; if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64)) MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits); else MaskSVT = MVT::getIntegerVT(MaskSizeInBits); MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts); SDLoc DL(Root); SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); return DAG.getBitcast(VT, CstOp); } /// Fully generic combining of x86 shuffle instructions. /// /// This should be the last combine run over the x86 shuffle instructions. Once /// they have been fully optimized, this will recursively consider all chains /// of single-use shuffle instructions, build a generic model of the cumulative /// shuffle operation, and check for simpler instructions which implement this /// operation. We use this primarily for two purposes: /// /// 1) Collapse generic shuffles to specialized single instructions when /// equivalent. In most cases, this is just an encoding size win, but /// sometimes we will collapse multiple generic shuffles into a single /// special-purpose shuffle. /// 2) Look for sequences of shuffle instructions with 3 or more total /// instructions, and replace them with the slightly more expensive SSSE3 /// PSHUFB instruction if available. We do this as the last combining step /// to ensure we avoid using PSHUFB if we can implement the shuffle with /// a suitable short sequence of other instructions. The PSHUFB will either /// use a register or have to read from memory and so is slightly (but only /// slightly) more expensive than the other shuffle instructions. /// /// Because this is inherently a quadratic operation (for each shuffle in /// a chain, we recurse up the chain), the depth is limited to 8 instructions. /// This should never be an issue in practice as the shuffle lowering doesn't /// produce sequences of more than 8 instructions. /// /// FIXME: We will currently miss some cases where the redundant shuffling /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. static SDValue combineX86ShufflesRecursively( ArrayRef SrcOps, int SrcOpIndex, SDValue Root, ArrayRef RootMask, ArrayRef SrcNodes, unsigned Depth, bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. const unsigned MaxRecursionDepth = 8; if (Depth > MaxRecursionDepth) return SDValue(); // Directly rip through bitcasts to find the underlying operand. SDValue Op = SrcOps[SrcOpIndex]; Op = peekThroughOneUseBitcasts(Op); MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return SDValue(); // Bail if we hit a non-vector. assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. SmallVector OpMask; SmallVector OpInputs; if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) return SDValue(); // Add the inputs to the Ops list, avoiding duplicates. SmallVector Ops(SrcOps.begin(), SrcOps.end()); auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int { // Attempt to find an existing match. SDValue InputBC = peekThroughBitcasts(Input); for (int i = 0, e = Ops.size(); i < e; ++i) if (InputBC == peekThroughBitcasts(Ops[i])) return i; // Match failed - should we replace an existing Op? if (InsertionPoint >= 0) { Ops[InsertionPoint] = Input; return InsertionPoint; } // Add to the end of the Ops list. Ops.push_back(Input); return Ops.size() - 1; }; SmallVector OpInputIdx; for (SDValue OpInput : OpInputs) OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1)); assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); // This function can be performance-critical, so we rely on the power-of-2 // knowledge that we have about the mask sizes to replace div/rem ops with // bit-masks and shifts. assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); unsigned MaskWidth = std::max(OpMask.size(), RootMask.size()); unsigned RootRatio = std::max(1, OpMask.size() >> RootMaskSizeLog2); unsigned OpRatio = std::max(1, RootMask.size() >> OpMaskSizeLog2); assert((RootRatio == 1 || OpRatio == 1) && "Must not have a ratio for both incoming and op masks!"); assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); unsigned RootRatioLog2 = countTrailingZeros(RootRatio); unsigned OpRatioLog2 = countTrailingZeros(OpRatio); SmallVector Mask(MaskWidth, SM_SentinelUndef); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the // root mask to get us all the way to the root value arrangement. The reason // for this order is that we are recursing up the operation chain. for (unsigned i = 0; i < MaskWidth; ++i) { unsigned RootIdx = i >> RootRatioLog2; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. Mask[i] = RootMask[RootIdx]; continue; } unsigned RootMaskedIdx = RootRatio == 1 ? RootMask[RootIdx] : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); // Just insert the scaled root mask value if it references an input other // than the SrcOp we're currently inserting. if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { Mask[i] = RootMaskedIdx; continue; } RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. Mask[i] = OpMask[OpIdx]; continue; } // Ok, we have non-zero lanes, map them through to one of the Op's inputs. unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx] : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); int InputIdx = OpMask[OpIdx] / (int)OpMask.size(); assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input"); OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth; Mask[i] = OpMaskedIdx; } // Handle the all undef/zero cases early. if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) return DAG.getUNDEF(Root.getValueType()); // TODO - should we handle the mixed zero/undef case as well? Just returning // a zero mask will lose information on undef elements possibly reducing // future combine possibilities. if (all_of(Mask, [](int Idx) { return Idx < 0; })) return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); // Remove unused/repeated shuffle source ops. resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); // Update the list of shuffle nodes that have been combined so far. SmallVector CombinedNodes(SrcNodes.begin(), SrcNodes.end()); CombinedNodes.push_back(Op.getNode()); // See if we can recurse into each shuffle source op (if it's a target // shuffle). The source op should only be generally combined if it either has // a single use (i.e. current Op) or all its users have already been combined, // if not then we can still combine but should prevent generation of variable // shuffles to avoid constant pool bloat. // Don't recurse if we already have more source ops than we can combine in // the remaining recursion depth. if (Ops.size() < (MaxRecursionDepth - Depth)) { for (int i = 0, e = Ops.size(); i < e; ++i) { bool AllowVar = false; if (Ops[i].getNode()->hasOneUse() || SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) AllowVar = AllowVariableMask; if (SDValue Res = combineX86ShufflesRecursively( Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask, AllowVar, DAG, Subtarget)) return Res; } } // Attempt to constant fold all of the constant source ops. if (SDValue Cst = combineX86ShufflesConstants( Ops, Mask, Root, HasVariableMask, DAG, Subtarget)) return Cst; // We can only combine unary and binary shuffle mask cases. if (Ops.size() <= 2) { // Minor canonicalization of the accumulated shuffle mask to make it easier // to match below. All this does is detect masks with sequential pairs of // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. SmallVector WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); } // Canonicalization of binary shuffle masks to improve pattern matching by // commuting the inputs. if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(Ops[0], Ops[1]); } // Finally, try to combine into a single shuffle instruction. return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, AllowVariableMask, DAG, Subtarget); } // If that failed and any input is extracted then try to combine as a // shuffle with the larger type. return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth, HasVariableMask, AllowVariableMask, DAG, Subtarget); } /// Helper entry wrapper to combineX86ShufflesRecursively. static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget); } /// Get the PSHUF-style mask from PSHUF node. /// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; SmallVector Ops; bool IsUnary; bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); (void)HaveMask; assert(HaveMask); // If we have more than 128-bits, only the low 128-bits of shuffle mask // matter. Check that the upper masks are repeats and remove them. if (VT.getSizeInBits() > 128) { int LaneElts = 128 / VT.getScalarSizeInBits(); #ifndef NDEBUG for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) for (int j = 0; j < LaneElts; ++j) assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && "Mask doesn't repeat in high 128-bit lanes!"); #endif Mask.resize(LaneElts); } switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; case X86ISD::PSHUFLW: Mask.resize(4); return Mask; case X86ISD::PSHUFHW: Mask.erase(Mask.begin(), Mask.begin() + 4); for (int &M : Mask) M -= 4; return Mask; default: llvm_unreachable("No valid shuffle instruction found!"); } } /// Search for a combinable shuffle across a chain ending in pshufd. /// /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); // Walk up a single-use chain looking for a combinable shuffle. Keep a stack // of the shuffles in the chain so that we can form a fresh chain to replace // this one. SmallVector Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFD: // Found another dword shuffle. break; case X86ISD::PSHUFLW: // Check that the low words (being shuffled) are the identity in the // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) return SDValue(); Chain.push_back(V); continue; case X86ISD::PSHUFHW: // Check that the high words (being shuffled) are the identity in the // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) return SDValue(); Chain.push_back(V); continue; case X86ISD::UNPCKL: case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && V.getSimpleValueType().getVectorElementType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) return SDValue(); Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; Chain.push_back(V); LLVM_FALLTHROUGH; case ISD::BITCAST: V = V.getOperand(0); continue; } break; } while (V.hasOneUse()); break; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return SDValue(); // Merge this node's mask and our incoming mask. SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Rebuild the chain around this new shuffle. while (!Chain.empty()) { SDValue W = Chain.pop_back_val(); if (V.getValueType() != W.getOperand(0).getValueType()) V = DAG.getBitcast(W.getOperand(0).getValueType(), V); switch (W.getOpcode()) { default: llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); case X86ISD::UNPCKL: case X86ISD::UNPCKH: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); break; case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); break; } } if (V.getValueType() != N.getValueType()) V = DAG.getBitcast(N.getValueType(), V); // Return the new chain to replace N. return V; } /// Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); MVT VT = N.getSimpleValueType(); SmallVector Mask; unsigned Opcode = N.getOpcode(); // Combine binary shuffle of 2 similar 'Horizontal' instructions into a // single instruction. if (VT.getScalarSizeInBits() == 64 && (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH || Opcode == X86ISD::UNPCKL)) { auto BC0 = peekThroughBitcasts(N.getOperand(0)); auto BC1 = peekThroughBitcasts(N.getOperand(1)); EVT VT0 = BC0.getValueType(); EVT VT1 = BC1.getValueType(); unsigned Opcode0 = BC0.getOpcode(); unsigned Opcode1 = BC1.getOpcode(); if (Opcode0 == Opcode1 && VT0 == VT1 && (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) { SDValue Lo, Hi; if (Opcode == X86ISD::MOVSD) { Lo = BC1.getOperand(0); Hi = BC0.getOperand(1); } else { Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); } SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); return DAG.getBitcast(VT, Horiz); } } switch (Opcode) { case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); SDValue BC = peekThroughBitcasts(Src); EVT SrcVT = Src.getValueType(); EVT BCVT = BC.getValueType(); // If broadcasting from another shuffle, attempt to simplify it. // TODO - we really need a general SimplifyDemandedVectorElts mechanism. if (isTargetShuffle(BC.getOpcode()) && VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) { unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits(); SmallVector DemandedMask(BCVT.getVectorNumElements(), SM_SentinelUndef); for (unsigned i = 0; i != Scale; ++i) DemandedMask[i] = i; if (SDValue Res = combineX86ShufflesRecursively( {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); } // broadcast(bitcast(src)) -> bitcast(broadcast(src)) // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward. if (Src.getOpcode() == ISD::BITCAST && SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) { EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(), VT.getVectorNumElements()); return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC)); } // Reduce broadcast source vector to lowest 128-bits. if (SrcVT.getSizeInBits() > 128) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, extract128BitVector(Src, 0, DAG, DL)); // broadcast(scalar_to_vector(x)) -> broadcast(x). if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); // Share broadcast with the longest vector and extract low subvector (free). for (SDNode *User : Src->uses()) if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && User->getValueSizeInBits(0) > VT.getSizeInBits()) { return extractSubVector(SDValue(User, 0), 0, DAG, DL, VT.getSizeInBits()); } return SDValue(); } case X86ISD::BLENDI: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types. // TODO: Handle MVT::v16i16 repeated blend mask. if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) { MVT SrcVT = N0.getOperand(0).getSimpleValueType(); if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && SrcVT.getScalarSizeInBits() >= 32) { unsigned Mask = N.getConstantOperandVal(2); unsigned Size = VT.getVectorNumElements(); unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), N1.getOperand(0), DAG.getConstant(ScaleMask, DL, MVT::i8))); } } return SDValue(); } case X86ISD::VPERMI: { // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements. // TODO: Remove when we have preferred domains in combineX86ShuffleChain. SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); unsigned EltSizeInBits = VT.getScalarSizeInBits(); if (N0.getOpcode() == ISD::BITCAST && N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) { SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1); return DAG.getBitcast(VT, Res); } return SDValue(); } case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; case X86ISD::MOVSD: case X86ISD::MOVSS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); // Canonicalize scalar FPOps: // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0]))) // If commutable, allow OP(N1[0], N0[0]). unsigned Opcode1 = N1.getOpcode(); if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB || Opcode1 == ISD::FDIV) { SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); if (N10 == N0 || (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) { if (N10 != N0) std::swap(N10, N11); MVT SVT = VT.getVectorElementType(); SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL); N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx); N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx); SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11); SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); return DAG.getNode(Opcode, DL, VT, N0, SclVec); } } return SDValue(); } case X86ISD::INSERTPS: { assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); SDValue Op2 = N.getOperand(2); unsigned InsertPSMask = cast(Op2)->getZExtValue(); unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; // If we zero out all elements from Op0 then we don't need to reference it. if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); // If we zero out the element from Op1 then we don't need to reference it. if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); // Attempt to merge insertps Op1 with an inner target shuffle node. SmallVector TargetMask1; SmallVector Ops1; if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { int M = TargetMask1[SrcIdx]; if (isUndefOrZero(M)) { // Zero/UNDEF insertion - zero out element and remove dependency. InsertPSMask |= (1u << DstIdx); return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Update insertps mask srcidx and reference the source input directly. assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector TargetMask0; SmallVector Ops0; if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) return SDValue(); bool Updated = false; bool UseInput00 = false; bool UseInput01 = false; for (int i = 0; i != 4; ++i) { int M = TargetMask0[i]; if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; } else if (isUndefOrZero(M)) { // If the target mask is undef/zero then we must zero the element. InsertPSMask |= (1u << i); Updated = true; continue; } // The input vector element must be inline. if (M != i && M != (i + 4)) return SDValue(); // Determine which inputs of the target shuffle we're using. UseInput00 |= (0 <= M && M < 4); UseInput01 |= (4 <= M); } // If we're not using both inputs of the target shuffle then use the // referenced input directly. if (UseInput00 && !UseInput01) { Updated = true; Op0 = Ops0[0]; } else if (!UseInput00 && UseInput01) { Updated = true; Op0 = Ops0[1]; } if (Updated) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); return SDValue(); } default: return SDValue(); } // Nuke no-op shuffles that show up after combining. if (isNoopShuffleMask(Mask)) return N.getOperand(0); // Look for simplifications involving one or two shuffle instructions. SDValue V = N.getOperand(0); switch (N.getOpcode()) { default: break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); V = DAG.getBitcast(DVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); return DAG.getBitcast(VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. // FIXME: This doesn't handle the location of the PSHUFD generically, and // only works when we have a PSHUFD followed by two half-shuffles. if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && V.hasOneUse()) { SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { SmallVector VMask = getPSHUFShuffleMask(V); SmallVector DMask = getPSHUFShuffleMask(D); int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int WordMask[8]; for (int i = 0; i < 4; ++i) { WordMask[i + NOffset] = Mask[i] + NOffset; WordMask[i + VOffset] = VMask[i] + VOffset; } // Map the word mask through the DWord mask. int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. V = DAG.getBitcast(VT, D.getOperand(0)); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V, V); } } } break; case X86ISD::PSHUFD: if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) return NewN; break; } return SDValue(); } /// Checks if the shuffle mask takes subsequent elements /// alternately from two vectors. /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct. static bool isAddSubOrSubAddMask(ArrayRef Mask, bool &Op0Even) { int ParitySrc[2] = {-1, -1}; unsigned Size = Mask.size(); for (unsigned i = 0; i != Size; ++i) { int M = Mask[i]; if (M < 0) continue; // Make sure we are using the matching element from the input. if ((M % Size) != i) return false; // Make sure we use the same input for all elements of the same parity. int Src = M / Size; if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src) return false; ParitySrc[i % 2] = Src; } // Make sure each input is used. if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1]) return false; Op0Even = ParitySrc[0] == 0; return true; } /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD) /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation /// are written to the parameters \p Opnd0 and \p Opnd1. /// /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes /// so it is easier to generically match. We also insert dummy vector shuffle /// nodes for the operands which explicitly discard the lanes which are unused /// by this operation to try to flow through the rest of the combiner /// the fact that they're unused. static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, bool &IsSubAdd) { EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) || !VT.getSimpleVT().isFloatingPoint()) return false; // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return false; SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); // Make sure we have an FADD and an FSUB. if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) || (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) || V1.getOpcode() == V2.getOpcode()) return false; // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) return false; // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS, RHS; if (V1.getOpcode() == ISD::FSUB) { LHS = V1->getOperand(0); RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) return false; } else { assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode"); LHS = V2->getOperand(0); RHS = V2->getOperand(1); if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) && (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS)) return false; } ArrayRef Mask = cast(N)->getMask(); bool Op0Even; if (!isAddSubOrSubAddMask(Mask, Op0Even)) return false; // It's a subadd if the vector in the even parity is an FADD. IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD; Opnd0 = LHS; Opnd1 = RHS; return true; } /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd. static SDValue combineShuffleToFMAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); MVT VT = N->getSimpleValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT)) return SDValue(); // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c). SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue FMAdd = Op0, FMSub = Op1; if (FMSub.getOpcode() != X86ISD::FMSUB) std::swap(FMAdd, FMSub); if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB || FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() || FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() || FMAdd.getOperand(2) != FMSub.getOperand(2)) return SDValue(); // Check for correct shuffle mask. ArrayRef Mask = cast(N)->getMask(); bool Op0Even; if (!isAddSubOrSubAddMask(Mask, Op0Even)) return SDValue(); // FMAddSub takes zeroth operand from FMSub node. SDLoc DL(N); bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd; unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1), FMAdd.getOperand(2)); } /// Try to combine a shuffle into a target-specific add-sub or /// mul-add-sub node. static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG)) return V; SDValue Opnd0, Opnd1; bool IsSubAdd; if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd)) return SDValue(); MVT VT = N->getSimpleValueType(0); SDLoc DL(N); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } if (IsSubAdd) return SDValue(); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! if (VT.is512BitVector()) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } // We are looking for a shuffle where both sources are concatenated with undef // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so // if we can express this as a single-source shuffle, that's preferable. static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX2() || !isa(N)) return SDValue(); EVT VT = N->getValueType(0); // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); if (VT.getVectorElementType() != MVT::i32 && VT.getVectorElementType() != MVT::i64 && VT.getVectorElementType() != MVT::f32 && VT.getVectorElementType() != MVT::f64) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Check that both sources are concats with undef. if (N0.getOpcode() != ISD::CONCAT_VECTORS || N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) return SDValue(); // Construct the new shuffle mask. Elements from the first source retain their // index, but elements from the second source no longer need to skip an undef. SmallVector Mask; int NumElts = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (int Elt : SVOp->getMask()) Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); SDLoc DL(N); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), N1.getOperand(0)); return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); } /// Eliminate a redundant shuffle of a horizontal math op. static SDValue foldShuffleOfHorizOp(SDNode *N) { unsigned Opcode = N->getOpcode(); if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST) if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) return SDValue(); // For a broadcast, peek through an extract element of index 0 to find the // horizontal op: broadcast (ext_vec_elt HOp, 0) EVT VT = N->getValueType(0); if (Opcode == X86ISD::VBROADCAST) { SDValue SrcOp = N->getOperand(0); if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT && SrcOp.getValueType() == MVT::f64 && SrcOp.getOperand(0).getValueType() == VT && isNullConstant(SrcOp.getOperand(1))) N = SrcOp.getNode(); } SDValue HOp = N->getOperand(0); if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD && HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB) return SDValue(); // 128-bit horizontal math instructions are defined to operate on adjacent // lanes of each operand as: // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] // ...similarly for v2f64 and v8i16. if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() && HOp.getOperand(0) != HOp.getOperand(1)) return SDValue(); // When the operands of a horizontal math op are identical, the low half of // the result is the same as the high half. If a target shuffle is also // replicating low and high halves, we don't need the shuffle. if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { if (HOp.getScalarValueSizeInBits() == 64) { // movddup (hadd X, X) --> hadd X, X // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X assert((HOp.getValueType() == MVT::v2f64 || HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && "Unexpected type for h-op"); return HOp; } return SDValue(); } // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X ArrayRef Mask = cast(N)->getMask(); // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, // but this should be tied to whatever horizontal op matching and shuffle // canonicalization are producing. if (HOp.getValueSizeInBits() == 128 && (isTargetShuffleEquivalent(Mask, {0, 0}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) || isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}))) return HOp; if (HOp.getValueSizeInBits() == 256 && (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) || isTargetShuffleEquivalent( Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11}))) return HOp; return SDValue(); } /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the /// low half of each source vector and does not set any high half elements in /// the destination vector, narrow the shuffle to half its original size. static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { if (!Shuf->getValueType(0).isSimple()) return SDValue(); MVT VT = Shuf->getSimpleValueType(0); if (!VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); // See if we can ignore all of the high elements of the shuffle. ArrayRef Mask = Shuf->getMask(); if (!isUndefUpperHalf(Mask)) return SDValue(); // Check if the shuffle mask accesses only the low half of each input vector // (half-index output is 0 or 2). int HalfIdx1, HalfIdx2; SmallVector HalfMask(Mask.size() / 2); if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) || (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1)) return SDValue(); // Create a half-width shuffle to replace the unnecessarily wide shuffle. // The trick is knowing that all of the insert/extract are actually free // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle // of narrow inputs into a narrow output, and that is always cheaper than // the wide shuffle that we started with. return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0), Shuf->getOperand(1), HalfMask, HalfIdx1, HalfIdx2, false, DAG); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (auto *Shuf = dyn_cast(N)) if (SDValue V = narrowShuffle(Shuf, DAG)) return V; // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. SDLoc dl(N); EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; if (SDValue HAddSub = foldShuffleOfHorizOp(N)) return HAddSub; } // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. // // This code performs the following transformation: // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) // // We do this only if both the bitcast and the BINOP dag nodes have // one use. Also, perform this transformation only if the new binary // operation is legal. This is to avoid introducing dag nodes that // potentially need to be further expanded (or custom lowered) into a // less optimal sequence of dag nodes. if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::VECTOR_SHUFFLE && N->getOperand(0).getOpcode() == ISD::BITCAST && N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue BC0 = N0.getOperand(0); EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { bool CanFold = false; switch (Opcode) { default : break; case ISD::ADD: case ISD::SUB: case ISD::MUL: // isOperationLegal lies for integer ops on floating point types. CanFold = VT.isInteger(); break; case ISD::FADD: case ISD::FSUB: case ISD::FMUL: // isOperationLegal lies for floating point ops on integer types. CanFold = VT.isFloatingPoint(); break; } unsigned SVTNumElts = SVT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) == (int)(i * 2); for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) < 0; if (CanFold) { SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); } } } // Attempt to combine into a vector load/broadcast. if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) return LD; // For AVX2, we sometimes want to combine // (vector_shuffle (concat_vectors t1, undef) // (concat_vectors t2, undef)) // Into: // (vector_shuffle (concat_vectors t1, t2), undef) // Since the latter can be efficiently lowered with VPERMD/VPERMQ if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget)) return ShufConcat; if (isTargetShuffle(N->getOpcode())) { SDValue Op(N, 0); if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget)) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle // instructions into higher-order shuffles. We do this after combining // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; // Simplify source operands based on shuffle mask. // TODO - merge this into combineX86ShufflesRecursively. APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI)) return SDValue(N, 0); } // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros // in the upper 64 bits. // TODO: Can we generalize this using computeKnownBits. if (N->getOpcode() == X86ISD::VZEXT_MOVL && (VT == MVT::v2f64 || VT == MVT::v2i64) && N->getOperand(0).getOpcode() == ISD::BITCAST && (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 || N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) { SDValue In = N->getOperand(0).getOperand(0); switch (In.getOpcode()) { default: break; case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI: case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI: case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P: case X86ISD::VFPROUND: case X86ISD::VMFPROUND: if (In.getOperand(0).getValueType() == MVT::v2f64 || In.getOperand(0).getValueType() == MVT::v2i64) return N->getOperand(0); // return the bitcast break; } } // Pull subvector inserts into undef through VZEXT_MOVL by making it an // insert into a zero vector. This helps get VZEXT_MOVL closer to // scalar_to_vectors where 256/512 are canonicalized to an insert and a // 128-bit scalar_to_vector. This reduces the number of isel patterns. if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() && N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR && N->getOperand(0).hasOneUse() && N->getOperand(0).getOperand(0).isUndef() && isNullConstant(N->getOperand(0).getOperand(2))) { SDValue In = N->getOperand(0).getOperand(1); SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), Movl, N->getOperand(0).getOperand(2)); } // If this a vzmovl of a full vector load, replace it with a vzload, unless // the load is volatile. if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && ISD::isNormalLoad(N->getOperand(0).getNode())) { LoadSDNode *LN = cast(N->getOperand(0)); if (!LN->isVolatile()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; SDValue VZLoad = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, VT.getVectorElementType(), LN->getPointerInfo(), LN->getAlignment(), MachineMemOperand::MOLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); return VZLoad; } } // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. // FIXME: This can probably go away once we default to widening legalization. if (Subtarget.hasSSE41() && VT == MVT::v4i32 && N->getOpcode() == ISD::VECTOR_SHUFFLE && N->getOperand(0).getOpcode() == ISD::BITCAST && N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) { SDValue BC = N->getOperand(0); SDValue MULUDQ = BC.getOperand(0); ShuffleVectorSDNode *SVOp = cast(N); ArrayRef Mask = SVOp->getMask(); if (BC.hasOneUse() && MULUDQ.hasOneUse() && Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) { SDValue Op0 = MULUDQ.getOperand(0); SDValue Op1 = MULUDQ.getOperand(1); if (Op0.getOpcode() == ISD::BITCAST && Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && Op0.getOperand(0).getValueType() == MVT::v4i32) { ShuffleVectorSDNode *SVOp0 = cast(Op0.getOperand(0)); ArrayRef Mask2 = SVOp0->getMask(); if (Mask2[0] == 0 && Mask2[1] == -1 && Mask2[2] == 1 && Mask2[3] == -1) { Op0 = SVOp0->getOperand(0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask); return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); } } if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && Op1.getOperand(0).getValueType() == MVT::v4i32) { ShuffleVectorSDNode *SVOp1 = cast(Op1.getOperand(0)); ArrayRef Mask2 = SVOp1->getMask(); if (Mask2[0] == 0 && Mask2[1] == -1 && Mask2[2] == 1 && Mask2[3] == -1) { Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask); Op1 = SVOp1->getOperand(0); return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); } } } } return SDValue(); } bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const { int NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); // Handle special case opcodes. switch (Opc) { case X86ISD::PMULDQ: case X86ISD::PMULUDQ: { APInt LHSUndef, LHSZero; APInt RHSUndef, RHSZero; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; // Multiply by zero. KnownZero = LHSZero | RHSZero; break; } case X86ISD::VSHL: case X86ISD::VSRL: case X86ISD::VSRA: { // We only need the bottom 64-bits of the (128-bit) shift amount. SDValue Amt = Op.getOperand(1); MVT AmtVT = Amt.getSimpleValueType(); assert(AmtVT.is128BitVector() && "Unexpected value type"); // If we reuse the shift amount just for sse shift amounts then we know that // only the bottom 64-bits are only ever used. bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) { unsigned UseOpc = Use->getOpcode(); return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL || UseOpc == X86ISD::VSRA) && Use->getOperand(0) != Amt; }); APInt AmtUndef, AmtZero; unsigned NumAmtElts = AmtVT.getVectorNumElements(); APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2); if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO, Depth + 1, AssumeSingleUse)) return true; LLVM_FALLTHROUGH; } case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); APInt SrcUndef; if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO, Depth + 1)) return true; // TODO convert SrcUndef to KnownUndef. break; } case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); APInt SrcUndef, SrcZero; APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; break; } case X86ISD::PACKSS: case X86ISD::PACKUS: { APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef, SrcZero, TLO, Depth + 1)) return true; if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef, SrcZero, TLO, Depth + 1)) return true; break; } case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: case X86ISD::FHSUB: { APInt DemandedLHS, DemandedRHS; getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt LHSUndef, LHSZero; if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef, LHSZero, TLO, Depth + 1)) return true; APInt RHSUndef, RHSZero; if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef, RHSZero, TLO, Depth + 1)) return true; break; } case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO, Depth + 1)) return true; KnownZero = SrcZero.zextOrTrunc(NumElts); KnownUndef = SrcUndef.zextOrTrunc(NumElts); break; } case X86ISD::BLENDV: { APInt SelUndef, SelZero; if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef, SelZero, TLO, Depth + 1)) return true; // TODO: Use SelZero to adjust LHS/RHS DemandedElts. APInt LHSUndef, LHSZero; if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; APInt RHSUndef, RHSZero; if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; KnownZero = LHSZero & RHSZero; KnownUndef = LHSUndef & RHSUndef; break; } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (!SrcVT.isVector()) return false; // Don't bother broadcasting if we just need the 0'th element. if (DemandedElts == 1) { if (Src.getValueType() != VT) Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG, SDLoc(Op)); return TLO.CombineTo(Op, Src); } APInt SrcUndef, SrcZero; APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; break; } case X86ISD::VPERMV: { SDValue Mask = Op.getOperand(0); APInt MaskUndef, MaskZero; if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, Depth + 1)) return true; break; } case X86ISD::PSHUFB: case X86ISD::VPERMV3: case X86ISD::VPERMILPV: { SDValue Mask = Op.getOperand(1); APInt MaskUndef, MaskZero; if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, Depth + 1)) return true; break; } case X86ISD::VPPERM: case X86ISD::VPERMIL2: { SDValue Mask = Op.getOperand(2); APInt MaskUndef, MaskZero; if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, Depth + 1)) return true; break; } } // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not // demand any of the high elements, then narrow the op to 128/256-bits: e.g. // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0 if ((VT.is256BitVector() || VT.is512BitVector()) && DemandedElts.lshr(NumElts / 2) == 0) { unsigned SizeInBits = VT.getSizeInBits(); unsigned ExtSizeInBits = SizeInBits / 2; // See if 512-bit ops only use the bottom 128-bits. if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0) ExtSizeInBits = SizeInBits / 4; switch (Opc) { // Zero upper elements. case X86ISD::VZEXT_MOVL: { SDLoc DL(Op); SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); } // Subvector broadcast. case X86ISD::SUBV_BROADCAST: { SDLoc DL(Op); SDValue Src = Op.getOperand(0); if (Src.getValueSizeInBits() > ExtSizeInBits) Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits); else if (Src.getValueSizeInBits() < ExtSizeInBits) { MVT SrcSVT = Src.getSimpleValueType().getScalarType(); MVT SrcVT = MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits()); Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src); } return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0, TLO.DAG, DL, ExtSizeInBits)); } // Byte shifts by immediate. case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: // Shift by uniform. case X86ISD::VSHL: case X86ISD::VSRL: case X86ISD::VSRA: // Shift by immediate. case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: { SDLoc DL(Op); SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1)); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); } case X86ISD::VPERMI: { // Simplify PERMPD/PERMQ to extract_subvector. // TODO: This should be done in shuffle combining. if (VT == MVT::v4f64 || VT == MVT::v4i64) { SmallVector Mask; DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask); if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) { SDLoc DL(Op); SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128); return TLO.CombineTo(Op, Insert); } } break; } // Target Shuffles. case X86ISD::PSHUFB: case X86ISD::UNPCKL: case X86ISD::UNPCKH: // Saturated Packs. case X86ISD::PACKSS: case X86ISD::PACKUS: // Horizontal Ops. case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: case X86ISD::FHSUB: { SDLoc DL(Op); MVT ExtVT = VT.getSimpleVT(); ExtVT = MVT::getVectorVT(ExtVT.getScalarType(), ExtSizeInBits / ExtVT.getScalarSizeInBits()); SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); SDValue Ext1 = extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits); SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); } } } // Simplify target shuffles. if (!isTargetShuffle(Opc) || !VT.isSimple()) return false; // Get target shuffle mask. bool IsUnary; SmallVector OpMask; SmallVector OpInputs; if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs, OpMask, IsUnary)) return false; // Shuffle inputs must be the same type as the result. if (llvm::any_of(OpInputs, [VT](SDValue V) { return VT != V.getValueType(); })) return false; // Clear known elts that might have been set above. KnownZero.clearAllBits(); KnownUndef.clearAllBits(); // Check if shuffle mask can be simplified to undef/zero/identity. int NumSrcs = OpInputs.size(); for (int i = 0; i != NumElts; ++i) { int &M = OpMask[i]; if (!DemandedElts[i]) M = SM_SentinelUndef; else if (0 <= M && OpInputs[M / NumElts].isUndef()) M = SM_SentinelUndef; } if (isUndefInRange(OpMask, 0, NumElts)) { KnownUndef.setAllBits(); return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); } if (isUndefOrZeroInRange(OpMask, 0, NumElts)) { KnownZero.setAllBits(); return TLO.CombineTo( Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); } for (int Src = 0; Src != NumSrcs; ++Src) if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts)) return TLO.CombineTo(Op, OpInputs[Src]); // Attempt to simplify inputs. for (int Src = 0; Src != NumSrcs; ++Src) { int Lo = Src * NumElts; APInt SrcElts = APInt::getNullValue(NumElts); for (int i = 0; i != NumElts; ++i) if (DemandedElts[i]) { int M = OpMask[i] - Lo; if (0 <= M && M < NumElts) SrcElts.setBit(M); } APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; } // Extract known zero/undef elements. // TODO - Propagate input undef/zero elts. for (int i = 0; i != NumElts; ++i) { if (OpMask[i] == SM_SentinelUndef) KnownUndef.setBit(i); if (OpMask[i] == SM_SentinelZero) KnownZero.setBit(i); } return false; } bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const { EVT VT = Op.getValueType(); unsigned BitWidth = OriginalDemandedBits.getBitWidth(); unsigned Opc = Op.getOpcode(); switch(Opc) { case X86ISD::PMULDQ: case X86ISD::PMULUDQ: { // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. KnownBits KnownOp; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); // FIXME: Can we bound this better? APInt DemandedMask = APInt::getLowBitsSet(64, 32); if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp, TLO, Depth + 1)) return true; if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, TLO, Depth + 1)) return true; break; } case X86ISD::VSHLI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (auto *ShiftImm = dyn_cast(Op1)) { if (ShiftImm->getAPIntValue().uge(BitWidth)) break; unsigned ShAmt = ShiftImm->getZExtValue(); APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a // single shift. We can do this if the bottom bits (which are shifted // out) are never demanded. if (Op0.getOpcode() == X86ISD::VSRLI && OriginalDemandedBits.countTrailingZeros() >= ShAmt) { if (auto *Shift2Imm = dyn_cast(Op0.getOperand(1))) { if (Shift2Imm->getAPIntValue().ult(BitWidth)) { int Diff = ShAmt - Shift2Imm->getZExtValue(); if (Diff == 0) return TLO.CombineTo(Op, Op0.getOperand(0)); unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI; SDValue NewShift = TLO.DAG.getNode( NewOpc, SDLoc(Op), VT, Op0.getOperand(0), TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); return TLO.CombineTo(Op, NewShift); } } } if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero <<= ShAmt; Known.One <<= ShAmt; // Low bits known zero. Known.Zero.setLowBits(ShAmt); } break; } case X86ISD::VSRLI: { if (auto *ShiftImm = dyn_cast(Op.getOperand(1))) { if (ShiftImm->getAPIntValue().uge(BitWidth)) break; unsigned ShAmt = ShiftImm->getZExtValue(); APInt DemandedMask = OriginalDemandedBits << ShAmt; if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // High bits known zero. Known.Zero.setHighBits(ShAmt); } break; } case X86ISD::VSRAI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (auto *ShiftImm = dyn_cast(Op1)) { if (ShiftImm->getAPIntValue().uge(BitWidth)) break; unsigned ShAmt = ShiftImm->getZExtValue(); APInt DemandedMask = OriginalDemandedBits << ShAmt; // If we just want the sign bit then we don't need to shift it. if (OriginalDemandedBits.isSignMask()) return TLO.CombineTo(Op, Op0); // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) { SDValue Op00 = Op0.getOperand(0); unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); if (ShAmt < NumSignBits) return TLO.CombineTo(Op, Op00); } // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. if (OriginalDemandedBits.countLeadingZeros() < ShAmt) DemandedMask.setSignBit(); if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. if (Known.Zero[BitWidth - ShAmt - 1] || OriginalDemandedBits.countLeadingZeros() >= ShAmt) return TLO.CombineTo( Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); // High bits are known one. if (Known.One[BitWidth - ShAmt - 1]) Known.One.setHighBits(ShAmt); } break; } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { SDValue Vec = Op.getOperand(0); auto *CIdx = dyn_cast(Op.getOperand(1)); MVT VecVT = Vec.getSimpleValueType(); unsigned NumVecElts = VecVT.getVectorNumElements(); if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) { unsigned Idx = CIdx->getZExtValue(); unsigned VecBitWidth = VecVT.getScalarSizeInBits(); // If we demand no bits from the vector then we must have demanded // bits from the implict zext - simplify to zero. APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth); if (DemandedVecBits == 0) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); APInt KnownUndef, KnownZero; APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx); if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; KnownBits KnownVec; if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, KnownVec, TLO, Depth + 1)) return true; Known = KnownVec.zext(BitWidth, true); return false; } break; } case X86ISD::PINSRB: case X86ISD::PINSRW: { SDValue Vec = Op.getOperand(0); SDValue Scl = Op.getOperand(1); auto *CIdx = dyn_cast(Op.getOperand(2)); MVT VecVT = Vec.getSimpleValueType(); if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) { unsigned Idx = CIdx->getZExtValue(); if (!OriginalDemandedElts[Idx]) return TLO.CombineTo(Op, Vec); KnownBits KnownVec; APInt DemandedVecElts(OriginalDemandedElts); DemandedVecElts.clearBit(Idx); if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts, KnownVec, TLO, Depth + 1)) return true; KnownBits KnownScl; unsigned NumSclBits = Scl.getScalarValueSizeInBits(); APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits); if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1)) return true; KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits()); Known.One = KnownVec.One & KnownScl.One; Known.Zero = KnownVec.Zero & KnownScl.Zero; return false; } break; } case X86ISD::PACKSS: // PACKSS saturates to MIN/MAX integer values. So if we just want the // sign bit then we can just ask for the source operands sign bit. // TODO - add known bits handling. if (OriginalDemandedBits.isSignMask()) { APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS); KnownBits KnownLHS, KnownRHS; APInt SignMask = APInt::getSignMask(BitWidth * 2); if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS, KnownLHS, TLO, Depth + 1)) return true; if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS, KnownRHS, TLO, Depth + 1)) return true; } // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support. break; case X86ISD::PCMPGT: // icmp sgt(0, R) == ashr(R, BitWidth-1). // iff we only need the sign bit then we can use R directly. if (OriginalDemandedBits.isSignMask() && ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) return TLO.CombineTo(Op, Op.getOperand(1)); break; case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); unsigned SrcBits = SrcVT.getScalarSizeInBits(); unsigned NumElts = SrcVT.getVectorNumElements(); // If we don't need the sign bits at all just return zero. if (OriginalDemandedBits.countTrailingZeros() >= NumElts) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); // Only demand the vector elements of the sign bits we need. APInt KnownUndef, KnownZero; APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts); if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; Known.Zero = KnownZero.zextOrSelf(BitWidth); Known.Zero.setHighBits(BitWidth - NumElts); // MOVMSK only uses the MSB from each vector element. KnownBits KnownSrc; if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts, KnownSrc, TLO, Depth + 1)) return true; if (KnownSrc.One[SrcBits - 1]) Known.One.setLowBits(NumElts); else if (KnownSrc.Zero[SrcBits - 1]) Known.Zero.setLowBits(NumElts); return false; } } return TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } /// Check if a vector extract from a target-specific shuffle of a load can be /// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); EVT EltVT = N->getValueType(0); if (!isa(EltNo)) return SDValue(); EVT OriginalVT = InVec.getValueType(); // Peek through bitcasts, don't duplicate a load with other uses. InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); if (!CurrentVT.isVector() || CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); SmallVector ShuffleMask; SmallVector ShuffleOps; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; if (Idx == SM_SentinelZero) return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); if (Idx == SM_SentinelUndef) return DAG.getUNDEF(EltVT); // Bail if any mask element is SM_SentinelZero - getVectorShuffle below // won't handle it. if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; })) return SDValue(); assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) return SDValue(); AllowedUses = 1; // only allow 1 load use if we have a bitcast LdNode = LdNode.getOperand(0); } if (!ISD::isNormalLoad(LdNode.getNode())) return SDValue(); LoadSDNode *LN0 = cast(LdNode); if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, ShuffleMask); Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } // Helper to peek through bitops/setcc to determine size of source vector. // Allows combineBitcastvxi1 to determine what size vector generated a . static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { switch (Src.getOpcode()) { case ISD::SETCC: return Src.getOperand(0).getValueSizeInBits() == Size; case ISD::AND: case ISD::XOR: case ISD::OR: return checkBitcastSrcVectorSize(Src.getOperand(0), Size) && checkBitcastSrcVectorSize(Src.getOperand(1), Size); } return false; } // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the illegal vector is scalarized on subtargets that don't have legal // vxi1 types. static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget) { EVT SrcVT = Src.getValueType(); if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) return SDValue(); // If the input is a truncate from v16i8 or v32i8 go ahead and use a // movmskb even with avx512. This will be better than truncating to vXi1 and // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 // vpcmpeqb/vpcmpgtb. bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && (Src.getOperand(0).getValueType() == MVT::v16i8 || Src.getOperand(0).getValueType() == MVT::v32i8 || Src.getOperand(0).getValueType() == MVT::v64i8); // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated)) return SDValue(); // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and // v8f64. So all legal 128-bit and 256-bit vectors are covered except for // v8i16 and v16i16. // For these two cases, we can shuffle the upper element bytes to a // consecutive sequence at the start of the vector and treat the results as // v16i8 or v32i8, and for v16i8 this is the preferable solution. However, // for v16i16 this is not the case, because the shuffle is expensive, so we // avoid sign-extending to this type entirely. // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; switch (SrcVT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v2i1: SExtVT = MVT::v2i64; break; case MVT::v4i1: SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) SExtVT = MVT::v4i64; break; case MVT::v8i1: SExtVT = MVT::v8i16; // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), // sign-extend to a 256-bit operation to match the compare. // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. // TODO : use checkBitcastSrcVectorSize if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() && (Src.getOperand(0).getValueType().is256BitVector() || Src.getOperand(0).getValueType().is512BitVector())) { SExtVT = MVT::v8i32; } break; case MVT::v16i1: SExtVT = MVT::v16i8; // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), // it is not profitable to sign-extend to 256-bit because this will // require an extra cross-lane shuffle which is more expensive than // truncating the result of the compare to 128-bits. break; case MVT::v32i1: SExtVT = MVT::v32i8; break; case MVT::v64i1: // If we have AVX512F, but not AVX512BW and the input is truncated from // v64i8 checked earlier. Then split the input and make two pmovmskbs. if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) { SExtVT = MVT::v64i8; break; } return SDValue(); }; SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); if (SExtVT == MVT::v64i8) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(V, DL); Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, DAG.getConstant(32, DL, MVT::i8)); V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); } else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) { V = getPMOVMSKB(DL, V, DAG, Subtarget); } else { if (SExtVT == MVT::v8i16) V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V, DAG.getUNDEF(MVT::v8i16)); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); } EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); V = DAG.getZExtOrTrunc(V, DL, IntVT); return DAG.getBitcast(VT, V); } // Convert a vXi1 constant build vector to the same width scalar integer. static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) { EVT SrcVT = Op.getValueType(); assert(SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector"); assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && "Expected a constant build vector"); APInt Imm(SrcVT.getVectorNumElements(), 0); for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) { SDValue In = Op.getOperand(Idx); if (!In.isUndef() && (cast(In)->getZExtValue() & 0x1)) Imm.setBit(Idx); } EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth()); return DAG.getConstant(Imm, SDLoc(Op), IntVT); } static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast"); if (!DCI.isBeforeLegalizeOps()) return SDValue(); // Only do this if we have k-registers. if (!Subtarget.hasAVX512()) return SDValue(); EVT DstVT = N->getValueType(0); SDValue Op = N->getOperand(0); EVT SrcVT = Op.getValueType(); if (!Op.hasOneUse()) return SDValue(); // Look for logic ops. if (Op.getOpcode() != ISD::AND && Op.getOpcode() != ISD::OR && Op.getOpcode() != ISD::XOR) return SDValue(); // Make sure we have a bitcast between mask registers and a scalar type. if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && DstVT.isScalarInteger()) && !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 && SrcVT.isScalarInteger())) return SDValue(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST && LHS.getOperand(0).getValueType() == DstVT) return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0), DAG.getBitcast(DstVT, RHS)); if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST && RHS.getOperand(0).getValueType() == DstVT) return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, DAG.getBitcast(DstVT, LHS), RHS.getOperand(0)); // If the RHS is a vXi1 build vector, this is a good reason to flip too. // Most of these have to move a constant from the scalar domain anyway. if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) { RHS = combinevXi1ConstantToInteger(RHS, DAG); return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, DAG.getBitcast(DstVT, LHS), RHS); } return SDValue(); } static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(BV); unsigned NumElts = BV->getNumOperands(); SDValue Splat = BV->getSplatValue(); // Build MMX element from integer GPR or SSE float values. auto CreateMMXElement = [&](SDValue V) { if (V.isUndef()) return DAG.getUNDEF(MVT::x86mmx); if (V.getValueType().isFloatingPoint()) { if (Subtarget.hasSSE1() && !isa(V)) { V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V); V = DAG.getBitcast(MVT::v2i64, V); return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V); } V = DAG.getBitcast(MVT::i32, V); } else { V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32); } return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V); }; // Convert build vector ops to MMX data in the bottom elements. SmallVector Ops; // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element. if (Splat) { if (Splat.isUndef()) return DAG.getUNDEF(MVT::x86mmx); Splat = CreateMMXElement(Splat); if (Subtarget.hasSSE1()) { // Unpack v8i8 to splat i8 elements to lowest 16-bits. if (NumElts == 8) Splat = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat, Splat); // Use PSHUFW to repeat 16-bit elements. unsigned ShufMask = (NumElts > 2 ? 0 : 0x44); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat, DAG.getConstant(ShufMask, DL, MVT::i8)); } Ops.append(NumElts, Splat); } else { for (unsigned i = 0; i != NumElts; ++i) Ops.push_back(CreateMMXElement(BV->getOperand(i))); } // Use tree of PUNPCKLs to build up general MMX vector. while (Ops.size() > 1) { unsigned NumOps = Ops.size(); unsigned IntrinOp = (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd : Intrinsic::x86_mmx_punpcklbw)); SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32); for (unsigned i = 0; i != NumOps; i += 2) Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin, Ops[i], Ops[i + 1]); Ops.resize(NumOps / 2); } return Ops[0]; } static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = N0.getValueType(); // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the setcc result is scalarized on subtargets that don't have legal // vxi1 types. if (DCI.isBeforeLegalize()) { SDLoc dl(N); if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && Subtarget.hasAVX512()) { N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); N0 = DAG.getBitcast(MVT::v8i1, N0); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, DAG.getIntPtrConstant(0, dl)); } // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasAVX512()) { unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector Ops(NumConcats, DAG.getUNDEF(SrcVT)); Ops[0] = N0; N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); N0 = DAG.getBitcast(MVT::i8, N0); return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); } } // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. if (VT == MVT::x86mmx) { // Detect MMX constant vectors. APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) { SDLoc DL(N0); // Handle zero-extension of i32 with MOVD. if (EltBits[0].countLeadingZeros() >= 32) return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT, DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32)); // Else, bitcast to a double. // TODO - investigate supporting sext 32-bit immediates on x86_64. APFloat F64(APFloat::IEEEdouble(), EltBits[0]); return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64)); } // Detect bitcasts to x86mmx low word. if (N0.getOpcode() == ISD::BUILD_VECTOR && (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) && N0.getOperand(0).getValueType() == SrcVT.getScalarType()) { bool LowUndef = true, AllUndefOrZero = true; for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) { SDValue Op = N0.getOperand(i); LowUndef &= Op.isUndef() || (i >= e/2); AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op)); } if (AllUndefOrZero) { SDValue N00 = N0.getOperand(0); SDLoc dl(N00); N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32) : DAG.getZExtOrTrunc(N00, dl, MVT::i32); return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00); } } // Detect bitcasts of 64-bit build vectors and convert to a // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the // lowest element. if (N0.getOpcode() == ISD::BUILD_VECTOR && (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8)) return createMMXBuildVector(cast(N0), DAG, Subtarget); // Detect bitcasts between element or subvector extraction to x86mmx. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0.getOperand(0); if (N00.getValueType().is128BitVector()) return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, DAG.getBitcast(MVT::v2i64, N00)); } // Detect bitcasts from FP_TO_SINT to x86mmx. if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) { SDLoc DL(N0); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, DAG.getBitcast(MVT::v2i64, Res)); } } // Try to remove a bitcast of constant vXi1 vector. We have to legalize // most of these to scalar anyway. if (Subtarget.hasAVX512() && VT.isScalarInteger() && SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { return combinevXi1ConstantToInteger(N0, DAG); } if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && isa(N0)) { auto *C = cast(N0); if (C->isAllOnesValue()) return DAG.getConstant(1, SDLoc(N0), VT); if (C->isNullValue()) return DAG.getConstant(0, SDLoc(N0), VT); } // Try to remove bitcasts from input and output of mask arithmetic to // remove GPR<->K-register crossings. if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget)) return V; // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand into a floating-point logic operation. This may // create a load of a constant, but that is cheaper than materializing the // constant in an integer register and transferring it to an SSE register or // transferring the SSE operand to integer register and back. unsigned FPOpcode; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); } if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64))) return SDValue(); SDValue LogicOp0 = N0.getOperand(0); SDValue LogicOp1 = N0.getOperand(1); SDLoc DL0(N0); // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT && !isa(LogicOp0.getOperand(0))) { SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); } // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT && !isa(LogicOp1.getOperand(0))) { SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); } return SDValue(); } // Given a ABS node, detect the following pattern: // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))). // This is useful as it is the input into a SAD pattern. static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) { SDValue AbsOp1 = Abs->getOperand(0); if (AbsOp1.getOpcode() != ISD::SUB) return false; Op0 = AbsOp1.getOperand(0); Op1 = AbsOp1.getOperand(1); // Check if the operands of the sub are zero-extended from vectors of i8. if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || Op1.getOpcode() != ISD::ZERO_EXTEND || Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) return false; return true; } // Given two zexts of to , create a PSADBW of the inputs // to these zexts. static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget) { // Find the appropriate width for the PSADBW. EVT InVT = Zext0.getOperand(0).getValueType(); unsigned RegSize = std::max(128u, InVT.getSizeInBits()); // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we // fill in the missing vector elements with 0. unsigned NumConcat = RegSize / InVT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getConstant(0, DL, InVT)); Ops[0] = Zext0.getOperand(0); MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); Ops[0] = Zext1.getOperand(0); SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW. auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64); return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops); }; MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 }, PSADBWBuilder); } // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with // PHMINPOSUW. static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE41. if (!Subtarget.hasSSE41()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. ISD::NodeType BinOp; SDValue Src = DAG.matchBinOpReduction( Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}); if (!Src) return SDValue(); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getScalarType(); if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) return SDValue(); SDLoc DL(Extract); SDValue MinPos = Src; // First, reduce the source down to 128-bit, applying BinOp to lo/hi. while (SrcVT.getSizeInBits() > 128) { unsigned NumElts = SrcVT.getVectorNumElements(); unsigned NumSubElts = NumElts / 2; SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts); unsigned SubSizeInBits = SrcVT.getSizeInBits(); SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits); SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && "Unexpected value type"); // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask // to flip the value accordingly. SDValue Mask; unsigned MaskEltsBits = ExtractVT.getSizeInBits(); if (BinOp == ISD::SMAX) Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::SMIN) Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::UMAX) Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); // For v16i8 cases we need to perform UMIN on pairs of byte elements, // shuffling each upper element down and insert zeros. This means that the // v16i8 UMIN will leave the upper element as zero, performing zero-extension // ready for the PHMINPOS. if (ExtractVT == MVT::i8) { SDValue Upper = DAG.getVectorShuffle( SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8), {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); } // Perform the PHMINPOS on a v8i16 vector, MinPos = DAG.getBitcast(MVT::v8i16, MinPos); MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); MinPos = DAG.getBitcast(SrcVT, MinPos); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, DAG.getIntPtrConstant(0, DL)); } // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE2. if (!Subtarget.hasSSE2()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); unsigned BitWidth = ExtractVT.getSizeInBits(); if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && ExtractVT != MVT::i8 && ExtractVT != MVT::i1) return SDValue(); // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns. ISD::NodeType BinOp; SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); if (!Match && ExtractVT == MVT::i1) Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR}); if (!Match) return SDValue(); // EXTRACT_VECTOR_ELT can require implicit extension of the vector element // which we can't support here for now. if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); SDValue Movmsk; SDLoc DL(Extract); EVT MatchVT = Match.getValueType(); unsigned NumElts = MatchVT.getVectorNumElements(); if (ExtractVT == MVT::i1) { // Special case for (pre-legalization) vXi1 reductions. if (NumElts > 32) return SDValue(); if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) { // If this is a legal AVX512 predicate type then we can just bitcast. EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = DAG.getBitcast(MovmskVT, Match); } else { // Use combineBitcastvxi1 to create the MOVMSK. if (NumElts == 32 && !Subtarget.hasInt256()) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); NumElts = 16; } EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); } if (!Movmsk) return SDValue(); Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32); } else { // Bail with AVX512VL (which uses predicate registers). if (Subtarget.hasVLX()) return SDValue(); unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && Subtarget.hasAVX()))) return SDValue(); // Make sure this isn't a vector of 1 element. The perf win from using // MOVMSK diminishes with less elements in the reduction, but it is // generally better to get the comparison over to the GPRs as soon as // possible to reduce the number of vector ops. if (Match.getValueType().getVectorNumElements() < 2) return SDValue(); // Check that we are extracting a reduction of all sign bits. if (DAG.ComputeNumSignBits(Match) != BitWidth) return SDValue(); if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); MatchSizeInBits = Match.getValueSizeInBits(); } // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. MVT MaskSrcVT; if (64 == BitWidth || 32 == BitWidth) MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), MatchSizeInBits / BitWidth); else MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match); Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); NumElts = MaskSrcVT.getVectorNumElements(); } assert(NumElts <= 32 && "Not expecting more than 32 elements"); if (BinOp == ISD::XOR) { // parity -> (AND (CTPOP(MOVMSK X)), 1) SDValue Mask = DAG.getConstant(1, DL, MVT::i32); SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk); Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask); return DAG.getZExtOrTrunc(Result, DL, ExtractVT); } SDValue CmpC; ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 CmpC = DAG.getConstant(0, DL, MVT::i32); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32); CondCode = ISD::CondCode::SETEQ; } // The setcc produces an i8 of 0/1, so extend that to the result width and // negate to get the final 0/-1 mask value. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT); SDValue Zero = DAG.getConstant(0, DL, ExtractVT); return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext); } static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. if (!Subtarget.hasSSE2()) return SDValue(); // Verify the type we're extracting from is any integer type above i16. EVT VT = Extract->getOperand(0).getValueType(); if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) return SDValue(); unsigned RegSize = 128; if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX()) RegSize = 256; // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. if (RegSize / VT.getVectorNumElements() < 8) return SDValue(); // Match shuffle + add pyramid. ISD::NodeType BinOp; SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD}); // The operand is expected to be zero extended from i8 // (verified in detectZextAbsDiff). // In order to convert to i64 and above, additional any/zero/sign // extend is expected. // The zero extend from 32 bit has no mathematical effect on the result. // Also the sign extend is basically zero extend // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || Root.getOpcode() == ISD::ZERO_EXTEND || Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. if (!Root || Root.getOpcode() != ISD::ABS) return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. SDValue Zext0, Zext1; if (!detectZextAbsDiff(Root, Zext0, Zext1)) return SDValue(); // Create the SAD instruction. SDLoc DL(Extract); SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget); // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); MVT SadVT = SAD.getSimpleValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); for(unsigned i = Stages - 3; i > 0; --i) { SmallVector Mask(SadElems, -1); for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) Mask[j] = MaskEnd + j; SDValue Shuffle = DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask); SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle); } } MVT Type = Extract->getSimpleValueType(0); unsigned TypeSizeInBits = Type.getSizeInBits(); // Return the lowest TypeSizeInBits bits. MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, Extract->getOperand(1)); } // Attempt to peek through a target shuffle and extract the scalar from the // source. static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue Src = N->getOperand(0); SDValue Idx = N->getOperand(1); EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); // Don't attempt this for boolean mask vectors or unknown extraction indices. if (SrcSVT == MVT::i1 || !isa(Idx)) return SDValue(); SDValue SrcBC = peekThroughBitcasts(Src); // Handle extract(broadcast(scalar_value)), it doesn't matter what index is. if (X86ISD::VBROADCAST == SrcBC.getOpcode()) { SDValue SrcOp = SrcBC.getOperand(0); if (SrcOp.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, SrcOp); } // Resolve the target shuffle inputs and mask. SmallVector Mask; SmallVector Ops; if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. if (Mask.size() != NumSrcElts) { if ((NumSrcElts % Mask.size()) == 0) { SmallVector ScaledMask; int Scale = NumSrcElts / Mask.size(); scaleShuffleMask(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { // Simplify Mask based on demanded element. int ExtractIdx = (int)N->getConstantOperandVal(1); int Scale = Mask.size() / NumSrcElts; int Lo = Scale * ExtractIdx; int Hi = Scale * (ExtractIdx + 1); for (int i = 0, e = (int)Mask.size(); i != e; ++i) if (i < Lo || Hi <= i) Mask[i] = SM_SentinelUndef; SmallVector WidenedMask; while (Mask.size() > NumSrcElts && canWidenShuffleElements(Mask, WidenedMask)) Mask = std::move(WidenedMask); // TODO - investigate support for wider shuffle masks with known upper // undef/zero elements for implicit zero-extension. } } // Check if narrowing/widening failed. if (Mask.size() != NumSrcElts) return SDValue(); int SrcIdx = Mask[N->getConstantOperandVal(1)]; SDLoc dl(N); // If the shuffle source element is undef/zero then we can just accept it. if (SrcIdx == SM_SentinelUndef) return DAG.getUNDEF(VT); if (SrcIdx == SM_SentinelZero) return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) : DAG.getConstant(0, dl, VT); SDValue SrcOp = Ops[SrcIdx / Mask.size()]; SrcIdx = SrcIdx % Mask.size(); // We can only extract other elements from 128-bit vectors and in certain // circumstances, depending on SSE-level. // TODO: Investigate using extract_subvector for larger vectors. // TODO: Investigate float/double extraction if it will be just stored. if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { assert(SrcSVT == VT && "Unexpected extraction type"); SrcOp = DAG.getBitcast(SrcVT, SrcOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); } if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && "Unexpected extraction type"); unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); SrcOp = DAG.getBitcast(SrcVT, SrcOp); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); return DAG.getZExtOrTrunc(ExtOp, dl, VT); } return SDValue(); } /// Extracting a scalar FP value from vector element 0 is free, so extract each /// operand first, then perform the math as a scalar op. static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract"); SDValue Vec = ExtElt->getOperand(0); SDValue Index = ExtElt->getOperand(1); EVT VT = ExtElt->getValueType(0); EVT VecVT = Vec.getValueType(); // TODO: If this is a unary/expensive/expand op, allow extraction from a // non-zero element because the shuffle+scalar op will be cheaper? if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT) return SDValue(); // Vector FP compares don't fit the pattern of FP math ops (propagate, not // extract, the condition code), so deal with those as a special-case. if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) { EVT OpVT = Vec.getOperand(0).getValueType().getScalarType(); if (OpVT != MVT::f32 && OpVT != MVT::f64) return SDValue(); // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC SDLoc DL(ExtElt); SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Vec.getOperand(0), Index); SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Vec.getOperand(1), Index); return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2)); } if (VT != MVT::f32 && VT != MVT::f64) return SDValue(); // Vector FP selects don't fit the pattern of FP math ops (because the // condition has a different type and we have to change the opcode), so deal // with those here. // FIXME: This is restricted to pre type legalization by ensuring the setcc // has i1 elements. If we loosen this we need to convert vector bool to a // scalar bool. if (Vec.getOpcode() == ISD::VSELECT && Vec.getOperand(0).getOpcode() == ISD::SETCC && Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 && Vec.getOperand(0).getOperand(0).getValueType() == VecVT) { // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0) SDLoc DL(ExtElt); SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Vec.getOperand(0).getValueType().getScalarType(), Vec.getOperand(0), Index); SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(1), Index); SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(2), Index); return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2); } // TODO: This switch could include FNEG and the x86-specific FP logic ops // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid // missed load folding and fma+fneg combining. switch (Vec.getOpcode()) { case ISD::FMA: // Begin 3 operands case ISD::FMAD: case ISD::FADD: // Begin 2 operands case ISD::FSUB: case ISD::FMUL: case ISD::FDIV: case ISD::FREM: case ISD::FCOPYSIGN: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: case ISD::FMAXIMUM: case ISD::FMINIMUM: case X86ISD::FMAX: case X86ISD::FMIN: case ISD::FABS: // Begin 1 operand case ISD::FSQRT: case ISD::FRINT: case ISD::FCEIL: case ISD::FTRUNC: case ISD::FNEARBYINT: case ISD::FROUND: case ISD::FFLOOR: case X86ISD::FRCP: case X86ISD::FRSQRT: { // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ... SDLoc DL(ExtElt); SmallVector ExtOps; for (SDValue Op : Vec->ops()) ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index)); return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps); } default: return SDValue(); } llvm_unreachable("All opcodes should return within switch"); } /// Try to convert a vector reduction sequence composed of binops and shuffles /// into horizontal ops. static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); if (!Subtarget.hasFastHorizontalOps() && !OptForSize) return SDValue(); SDValue Index = ExtElt->getOperand(1); if (!isNullConstant(Index)) return SDValue(); // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros. ISD::NodeType Opc; SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}); if (!Rdx) return SDValue(); EVT VT = ExtElt->getValueType(0); EVT VecVT = ExtElt->getOperand(0).getValueType(); if (VecVT.getScalarType() != VT) return SDValue(); unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; SDLoc DL(ExtElt); // 256-bit horizontal instructions operate on 128-bit chunks rather than // across the whole vector, so we need an extract + hop preliminary stage. // This is the only step where the operands of the hop are not the same value. // TODO: We could extend this to handle 512-bit or even longer vectors. if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) || ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) { unsigned NumElts = VecVT.getVectorNumElements(); SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL); SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL); VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2); Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo); } if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) && !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3())) return SDValue(); // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0 assert(Rdx.getValueType() == VecVT && "Unexpected reduction match"); unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements()); for (unsigned i = 0; i != ReductionSteps; ++i) Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); } /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading /// scalars back, while for x64 we should use 64-bit extracts and shifts. static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) return NewOp; SDValue InputVector = N->getOperand(0); SDValue EltIdx = N->getOperand(1); auto *CIdx = dyn_cast(EltIdx); EVT SrcVT = InputVector.getValueType(); EVT VT = N->getValueType(0); SDLoc dl(InputVector); bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT; if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements())) return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); // Integer Constant Folding. if (CIdx && VT.isInteger()) { APInt UndefVecElts; SmallVector EltBits; unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits(); if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts, EltBits, true, false)) { uint64_t Idx = CIdx->getZExtValue(); if (UndefVecElts[Idx]) return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()), dl, VT); } } // TODO - Remove this once we can handle the implicit zero-extension of // X86ISD::PEXTRW/X86ISD::PEXTRB in: // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and // combineBasicSADPattern. if (IsPextr) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits( SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI)) return SDValue(N, 0); return SDValue(); } if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; // Detect mmx extraction of all bits as a i64. It works better as a bitcast. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getBitcast(VT, InputVector); } // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } // Check whether this extract is the root of a sum of absolute differences // pattern. This has to be done here because we really want it to happen // pre-legalization, if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) return SAD; // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) return Cmp; // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW. if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget)) return V; if (SDValue V = scalarizeExtEltFP(N, DAG)) return V; // Attempt to extract a i1 element by using MOVMSK to extract the signbits // and then testing the relevant element. if (CIdx && SrcVT.getScalarType() == MVT::i1) { SmallVector BoolExtracts; auto IsBoolExtract = [&BoolExtracts](SDNode *Use) { if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa(Use->getOperand(1)) && Use->getValueType(0) == MVT::i1) { BoolExtracts.push_back(Use); return true; } return false; }; if (all_of(InputVector->uses(), IsBoolExtract) && BoolExtracts.size() > 1) { unsigned NumSrcElts = SrcVT.getVectorNumElements(); EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); if (SDValue BC = combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { for (SDNode *Use : BoolExtracts) { // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask unsigned MaskIdx = Use->getConstantOperandVal(1); APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx); SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT); SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask); Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ); DCI.CombineTo(Use, Res); } return SDValue(N, 0); } } } return SDValue(); } /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() != ISD::VSELECT) return SDValue(); assert(CondVT.isVector() && "Vector select expects a vector selector!"); // Check if the first operand is all zeros and Cond type is vXi1. // This situation only applies to avx512. // TODO: Use isNullOrNullSplat() to distinguish constants with undefs? // TODO: Can we assert that both operands are not zeros (because that should // get simplified at node creation time)? bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNOT(DL, Cond, CondVT); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must // have already been promoted from the IR select condition type . // Don't check if the types themselves are equal because that excludes // vector floating-point selects. if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); // Try to invert the condition if true value is not all 1s and false value is // not all 0s. Only do this if the condition has one use. bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() && // Check if the selector will be produced by CMPP*/PCMP*. Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted. TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == CondVT) { bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); if (TValIsAllZeros || FValIsAllOnes) { SDValue CC = Cond.getOperand(2); ISD::CondCode NewCC = ISD::getSetCCInverse(cast(CC)->get(), Cond.getOperand(0).getValueType().isInteger()); Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); std::swap(LHS, RHS); TValIsAllOnes = FValIsAllOnes; FValIsAllZeros = TValIsAllZeros; } } // Cond value must be 'sign splat' to be converted to a logical op. if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits()) return SDValue(); // vselect Cond, 111..., 000... -> Cond if (TValIsAllOnes && FValIsAllZeros) return DAG.getBitcast(VT, Cond); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT)) return SDValue(); // vselect Cond, 111..., X -> or Cond, X if (TValIsAllOnes) { SDValue CastRHS = DAG.getBitcast(CondVT, RHS); SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS); return DAG.getBitcast(VT, Or); } // vselect Cond, X, 000... -> and Cond, X if (FValIsAllZeros) { SDValue CastLHS = DAG.getBitcast(CondVT, LHS); SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS); return DAG.getBitcast(VT, And); } // vselect Cond, 000..., X -> andn Cond, X if (TValIsAllZeros) { MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64); SDValue CastCond = DAG.getBitcast(AndNVT, Cond); SDValue CastRHS = DAG.getBitcast(AndNVT, RHS); SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS); return DAG.getBitcast(VT, AndN); } return SDValue(); } /// If both arms of a vector select are concatenated vectors, split the select, /// and concatenate the result to eliminate a wide (256-bit) vector instruction: /// vselect Cond, (concat T0, T1), (concat F0, F1) --> /// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1) static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT) return SDValue(); // TODO: Split 512-bit vectors too? EVT VT = N->getValueType(0); if (!VT.is256BitVector()) return SDValue(); // TODO: Split as long as any 2 of the 3 operands are concatenated? SDValue Cond = N->getOperand(0); SDValue TVal = N->getOperand(1); SDValue FVal = N->getOperand(2); SmallVector CatOpsT, CatOpsF; if (!TVal.hasOneUse() || !FVal.hasOneUse() || !collectConcatOps(TVal.getNode(), CatOpsT) || !collectConcatOps(FVal.getNode(), CatOpsF)) return SDValue(); auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal }, makeBlend, /*CheckBWI*/ false); } static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); SDLoc DL(N); auto *TrueC = dyn_cast(LHS); auto *FalseC = dyn_cast(RHS); if (!TrueC || !FalseC) return SDValue(); // Don't do this for crazy integer types. EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // We're going to use the condition bit in math or logic ops. We could allow // this with a wider condition value (post-legalization it becomes an i8), // but if nothing is creating selects that late, it doesn't matter. if (Cond.getValueType() != MVT::i1) return SDValue(); // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by // 3, 5, or 9 with i32/i64, so those get transformed too. // TODO: For constants that overflow or do not differ by power-of-2 or small // multiplier, convert to 'and' + 'add'. const APInt &TrueVal = TrueC->getAPIntValue(); const APInt &FalseVal = FalseC->getAPIntValue(); bool OV; APInt Diff = TrueVal.ssub_ov(FalseVal, OV); if (OV) return SDValue(); APInt AbsDiff = Diff.abs(); if (AbsDiff.isPowerOf2() || ((VT == MVT::i32 || VT == MVT::i64) && (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) { // We need a positive multiplier constant for shift/LEA codegen. The 'not' // of the condition can usually be folded into a compare predicate, but even // without that, the sequence should be cheaper than a CMOV alternative. if (TrueVal.slt(FalseVal)) { Cond = DAG.getNOT(DL, Cond, MVT::i1); std::swap(TrueC, FalseC); } // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); // Multiply condition by the difference if non-one. if (!AbsDiff.isOneValue()) R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT)); // Add the base if non-zero. if (!FalseC->isNullValue()) R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0)); return R; } return SDValue(); } /// If this is a *dynamic* select (non-constant condition) and we can match /// this node with one of the variable blend instructions, restructure the /// condition so that blends can use the high (sign) bit of each element. /// This function will also call SimplifyDemandedBits on already created /// BLENDV to perform additional simplifications. static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); if ((N->getOpcode() != ISD::VSELECT && N->getOpcode() != X86ISD::BLENDV) || ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); // Don't optimize before the condition has been transformed to a legal type // and don't ever optimize vector selects that map to AVX512 mask-registers. unsigned BitWidth = Cond.getScalarValueSizeInBits(); if (BitWidth < 8 || BitWidth > 64) return SDValue(); // We can only handle the cases where VSELECT is directly legal on the // subtarget. We custom lower VSELECT nodes with constant conditions and // this makes it hard to see whether a dynamic VSELECT will correctly // lower, so we both check the operation's status and explicitly handle the // cases where a *dynamic* blend will fail even though a constant-condition // blend could be custom lowered. // FIXME: We should find a better way to handle this class of problems. // Potentially, we should combine constant-condition vselect nodes // pre-legalization into shuffles and not mark as many types as custom // lowered. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); // FIXME: We don't support i16-element blends currently. We could and // should support them by making *all* the bits in the condition be set // rather than just the high bit and using an i8-element blend. if (VT.getVectorElementType() == MVT::i16) return SDValue(); // Dynamic blending was only available from SSE4.1 onward. if (VT.is128BitVector() && !Subtarget.hasSSE41()) return SDValue(); // Byte blends are only available in AVX2 if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) return SDValue(); // There are no 512-bit blend instructions that use sign bits. if (VT.is512BitVector()) return SDValue(); // TODO: Add other opcodes eventually lowered into BLEND. for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); UI != UE; ++UI) if ((UI->getOpcode() != ISD::VSELECT && UI->getOpcode() != X86ISD::BLENDV) || UI.getOperandNo() != 0) return SDValue(); APInt DemandedMask(APInt::getSignMask(BitWidth)); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true)) return SDValue(); // If we changed the computation somewhere in the DAG, this change will // affect all users of Cond. Update all the nodes so that we do not use // the generic VSELECT anymore. Otherwise, we may perform wrong // optimizations as we messed with the actual expectation for the vector // boolean values. for (SDNode *U : Cond->uses()) { if (U->getOpcode() == X86ISD::BLENDV) continue; SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0), Cond, U->getOperand(1), U->getOperand(2)); DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); DCI.AddToWorklist(U); } DCI.CommitTargetLoweringOpt(TLO); return SDValue(N, 0); } /// Do target-specific dag combines on SELECT and VSELECT nodes. static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); // Try simplification again because we use this function to optimize // BLENDV nodes that are not handled by the generic combiner. if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS)) return V; EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Convert vselects with constant condition into shuffles. if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && DCI.isBeforeLegalizeOps()) { SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond)) return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); } // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom x(Cond.getOperand(2))->get(); unsigned Opcode = 0; // Check for x CC y ? x : y. if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMIN; break; case ISD::SETULE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMIN; break; case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETUGT: // Converting this to a max would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETUGE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMAX; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETOGE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; break; case ISD::SETUGE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMIN; break; case ISD::SETULT: // Converting this to a max would handle NaNs incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETOLE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETULE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMAX; break; } } if (Opcode) return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } // Some mask scalar intrinsics rely on checking if only one bit is set // and implement it in C code like this: // A[0] = (U & 1) ? A[0] : W[0]; // This creates some redundant instructions that break pattern matching. // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y) if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); SDValue AndNode = Cond.getOperand(0); if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && isOneConstant(AndNode.getOperand(1))) { // LHS and RHS swapped due to // setcc outputting 1 when AND resulted in 0 and vice versa. AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8); return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS); } } // v16i8 (select v16i1, v16i8, v16i8) does not have a proper // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. // The same situation all vectors of i8 and i16 without BWI. // Make sure we extend these even before type legalization gets a chance to // split wide vectors. // Since SKX these selects have a proper lowering. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && (ExperimentalVectorWideningLegalization || VT.getVectorNumElements() > 4) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); } // AVX512 - Extend select with zero to merge with target shuffle. // select(mask, extract_subvector(shuffle(x)), zero) --> // extract_subvector(select(insert_subvector(mask), shuffle(x), zero)) // TODO - support non target shuffles as well. if (Subtarget.hasAVX512() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1) { auto SelectableOp = [&TLI](SDValue Op) { return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && isTargetShuffle(Op.getOperand(0).getOpcode()) && isNullConstant(Op.getOperand(1)) && TLI.isTypeLegal(Op.getOperand(0).getValueType()) && Op.hasOneUse() && Op.getOperand(0).hasOneUse(); }; bool SelectableLHS = SelectableOp(LHS); bool SelectableRHS = SelectableOp(RHS); bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode()); bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode()); if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) { EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType() : RHS.getOperand(0).getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts); LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL, VT.getSizeInBits()); RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL, VT.getSizeInBits()); Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT, DAG.getUNDEF(SrcCondVT), Cond, DAG.getIntPtrConstant(0, DL)); SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS); return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits()); } } if (SDValue V = combineSelectOfTwoConstants(N, DAG)) return V; // Canonicalize max and min: // (x > y) ? x : y -> (x >= y) ? x : y // (x < y) ? x : y -> (x <= y) ? x : y // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates // the need for an extra compare // against zero. e.g. // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 // subl %esi, %edi // testl %edi, %edi // movl $0, %eax // cmovgl %edi, %eax // => // xorl %eax, %eax // subl %esi, $edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); switch (CC) { default: break; case ISD::SETLT: case ISD::SETGT: { ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } } // Match VSELECTs into subs with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 for i8 and i16 vectors. Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 && isPowerOf2_32(VT.getVectorNumElements()) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); // Check if one of the arms of the VSELECT is a zero vector. If it's on the // left side invert the predicate to simplify logic below. SDValue Other; if (ISD::isBuildVectorAllZeros(LHS.getNode())) { Other = RHS; CC = ISD::getSetCCInverse(CC, true); } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { Other = LHS; } if (Other.getNode() && Other->getNumOperands() == 2 && Other->getOperand(0) == Cond.getOperand(0)) { SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); SDValue CondRHS = Cond->getOperand(1); // Look for a general sub with unsigned saturation first. // x >= y ? x-y : 0 --> subus x, y // x > y ? x-y : 0 --> subus x, y if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && Other->getOpcode() == ISD::SUB && OpRHS == CondRHS) return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); if (auto *OpRHSBV = dyn_cast(OpRHS)) { if (isa(CondRHS)) { // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { return (!Op && !Cond) || (Op && Cond && Cond->getAPIntValue() == (-Op->getAPIntValue() - 1)); }; if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT, /*AllowUndefs*/ true)) { OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), OpRHS); return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. // FIXME: Would it be better to use computeKnownBits to determine // whether it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && OpRHSConst->getAPIntValue().isSignMask()) { // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT); return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } } } } } } // Match VSELECTs into add with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // paddus is available in SSE2 for i8 and i16 vectors. Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 && isPowerOf2_32(VT.getVectorNumElements()) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); SDValue CondLHS = Cond->getOperand(0); SDValue CondRHS = Cond->getOperand(1); // Check if one of the arms of the VSELECT is vector with all bits set. // If it's on the left side invert the predicate to simplify logic below. SDValue Other; if (ISD::isBuildVectorAllOnes(LHS.getNode())) { Other = RHS; CC = ISD::getSetCCInverse(CC, true); } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) { Other = LHS; } if (Other.getNode() && Other.getOpcode() == ISD::ADD) { SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); // Canonicalize condition operands. if (CC == ISD::SETUGE) { std::swap(CondLHS, CondRHS); CC = ISD::SETULE; } // We can test against either of the addition operands. // x <= x+y ? x+y : ~0 --> addus x, y // x+y >= x ? x+y : ~0 --> addus x, y if (CC == ISD::SETULE && Other == CondRHS && (OpLHS == CondLHS || OpRHS == CondLHS)) return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); if (isa(OpRHS) && isa(CondRHS) && CondLHS == OpLHS) { // If the RHS is a constant we have to reverse the const // canonicalization. // x > ~C ? x+C : ~0 --> addus x, C auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { return Cond->getAPIntValue() == ~Op->getAPIntValue(); }; if (CC == ISD::SETULE && ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT)) return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); } } } // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) return V; if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget)) return V; if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) return V; // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); RHS = DAG.getBitcast(MVT::i64, RHS); SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS); return DAG.getBitcast(VT, newSelect); } return SDValue(); } /// Combine: /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) /// to: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) /// i.e., reusing the EFLAGS produced by the LOCKed instruction. /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Can't replace the cmp if it has more uses than the one we're looking at. // FIXME: We would like to be able to handle this, but would need to make sure // all uses were updated. if (!Cmp.hasOneUse()) return SDValue(); // This only applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) // Using the proper condcodes (see below), overflow is checked for. // FIXME: We can generalize both constraints: // - XOR/OR/AND (if they were made to survive AtomicExpand) // - LHS != 1 // if the result is compared. SDValue CmpLHS = Cmp.getOperand(0); SDValue CmpRHS = Cmp.getOperand(1); if (!CmpLHS.hasOneUse()) return SDValue(); unsigned Opc = CmpLHS.getOpcode(); if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) return SDValue(); SDValue OpRHS = CmpLHS.getOperand(2); auto *OpRHSC = dyn_cast(OpRHS); if (!OpRHSC) return SDValue(); APInt Addend = OpRHSC->getAPIntValue(); if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; auto *CmpRHSC = dyn_cast(CmpRHS); if (!CmpRHSC) return SDValue(); APInt Comparison = CmpRHSC->getAPIntValue(); // If the addend is the negation of the comparison value, then we can do // a full comparison by emitting the atomic arithmetic as a locked sub. if (Comparison == -Addend) { // The CC is fine, but we need to rewrite the LHS of the comparison as an // atomic sub. auto *AN = cast(CmpLHS.getNode()); auto AtomicSub = DAG.getAtomic( ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(), /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1), /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()), AN->getMemOperand()); auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // We can handle comparisons with zero in a number of cases by manipulating // the CC used. if (!Comparison.isNullValue()) return SDValue(); if (CC == X86::COND_S && Addend == 1) CC = X86::COND_LE; else if (CC == X86::COND_NS && Addend == 1) CC = X86::COND_G; else if (CC == X86::COND_G && Addend == -1) CC = X86::COND_GE; else if (CC == X86::COND_LE && Addend == -1) CC = X86::COND_L; else return SDValue(); SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // Check whether a boolean test is testing a boolean value generated by // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition // code. // // Simplify the following patterns: // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) // to (Op EFLAGS Cond) // // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) // to (Op EFLAGS !Cond) // // where Op could be BRCOND or CMOV. // static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Quit if not used as a boolean value. if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); // Check CMP operands. One of them should be 0 or 1 and the other should be // an SetCC or extended from it. SDValue Op1 = Cmp.getOperand(0); SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast(Op1))) SetCC = Op2; else if ((C = dyn_cast(Op2))) SetCC = Op1; else // Quit if all operands are not constants. return SDValue(); if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; checkAgainstTrue = true; } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 or 1. return SDValue(); bool truncatedToBoolWithAnd = false; // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx < 0) break; SetCC = SetCC.getOperand(OpIdx); truncatedToBoolWithAnd = true; } else SetCC = SetCC.getOperand(0); } switch (SetCC.getOpcode()) { case X86ISD::SETCC_CARRY: // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, // i.e. it's a comparison against true but the result of SETCC_CARRY is not // truncated to i1 using 'and'. if (checkAgainstTrue && !truncatedToBoolWithAnd) break; assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!"); LLVM_FALLTHROUGH; case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(1); case X86ISD::CMOV: { // Check whether false/true value has canonical one, i.e. 0 or 1. ConstantSDNode *FVal = dyn_cast(SetCC.getOperand(0)); ConstantSDNode *TVal = dyn_cast(SetCC.getOperand(1)); // Quit if true value is not a constant. if (!TVal) return SDValue(); // Quit if false value is not a constant. if (!FVal) { SDValue Op = SetCC.getOperand(0); // Skip 'zext' or 'trunc' node. if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE) Op = Op.getOperand(0); // A special case for rdrand/rdseed, where 0 is set if false cond is // found. if ((Op.getOpcode() != X86ISD::RDRAND && Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. bool FValIsFalse = true; if (FVal && FVal->getZExtValue() != 0) { if (FVal->getZExtValue() != 1) return SDValue(); // If FVal is 1, opposite cond is needed. needOppositeCond = !needOppositeCond; FValIsFalse = false; } // Quit if TVal is not the constant opposite of FVal. if (FValIsFalse && TVal->getZExtValue() != 1) return SDValue(); if (!FValIsFalse && TVal->getZExtValue() != 0) return SDValue(); CC = X86::CondCode(SetCC.getConstantOperandVal(2)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(3); } } return SDValue(); } /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. /// Match: /// (X86or (X86setcc) (X86setcc)) /// (X86cmp (and (X86setcc) (X86setcc)), 0) static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); } isAnd = false; SDValue SetCC0, SetCC1; switch (Cond->getOpcode()) { default: return false; case ISD::AND: case X86ISD::AND: isAnd = true; LLVM_FALLTHROUGH; case ISD::OR: case X86ISD::OR: SetCC0 = Cond->getOperand(0); SetCC1 = Cond->getOperand(1); break; }; // Make sure we have SETCC nodes, using the same flags value. if (SetCC0.getOpcode() != X86ISD::SETCC || SetCC1.getOpcode() != X86ISD::SETCC || SetCC0->getOperand(1) != SetCC1->getOperand(1)) return false; CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); Flags = SetCC0->getOperand(1); return true; } // When legalizing carry, we create carries via add X, -1 // If that comes from an actual carry, via setcc, we use the // carry directly. static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { SDValue Carry = EFLAGS.getOperand(0); while (Carry.getOpcode() == ISD::TRUNCATE || Carry.getOpcode() == ISD::ZERO_EXTEND || Carry.getOpcode() == ISD::SIGN_EXTEND || Carry.getOpcode() == ISD::ANY_EXTEND || (Carry.getOpcode() == ISD::AND && isOneConstant(Carry.getOperand(1)))) Carry = Carry.getOperand(0); if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB? uint64_t CarryCC = Carry.getConstantOperandVal(0); SDValue CarryOp1 = Carry.getOperand(1); if (CarryCC == X86::COND_B) return CarryOp1; if (CarryCC == X86::COND_A) { // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp // instruction cannot take an immediate as its first operand. // if (CarryOp1.getOpcode() == X86ISD::SUB && CarryOp1.getNode()->hasOneUse() && CarryOp1.getValueType().isInteger() && !isa(CarryOp1.getOperand(1))) { SDValue SubCommute = DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(), CarryOp1.getOperand(1), CarryOp1.getOperand(0)); return SDValue(SubCommute.getNode(), CarryOp1.getResNo()); } } // If this is a check of the z flag of an add with 1, switch to the // C flag. if (CarryCC == X86::COND_E && CarryOp1.getOpcode() == X86ISD::ADD && isOneConstant(CarryOp1.getOperand(1))) return CarryOp1; } } } return SDValue(); } /// Optimize an EFLAGS definition used according to the condition code \p CC /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (CC == X86::COND_B) if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG)) return Flags; if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue FalseOp = N->getOperand(0); SDValue TrueOp = N->getOperand(1); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); // cmov X, X, ?, ? --> X if (TrueOp == FalseOp) return TrueOp; // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. if (ConstantSDNode *TrueC = dyn_cast(TrueOp)) { if (ConstantSDNode *FalseC = dyn_cast(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); std::swap(TrueOp, FalseOp); } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, DAG.getConstant(ShAmt, DL, MVT::i8)); return Cond; } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; bool isFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) case 3: // result = lea base(cond, cond*2) case 4: // result = lea base( , cond*4) case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = getSETCC(CC, Cond, DL ,DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, DAG.getConstant(Diff, DL, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } } } } // Handle these cases: // (select (x != c), e, c) -> select (x != c), e, x), // (select (x == c), c, e) -> select (x == c), x, e) // where the c is an integer constant, and the "select" is the combination // of CMOV and CMP. // // The rationale for this change is that the conditional-move from a constant // needs two instructions, however, conditional-move from a register needs // only one instruction. // // CAVEAT: By replacing a constant with a symbolic value, it may obscure // some instruction-combining opportunities. This opt needs to be // postponed as late as possible. // if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast(Cond.getOperand(1))) && !isa(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast(FalseOp)) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueOp, FalseOp); } if (CC == X86::COND_E && CmpAgainst == dyn_cast(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, DL, MVT::i8), Cond }; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } } // Fold and/or of setcc's to double CMOV: // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) // // This combine lets us generate: // cmovcc1 (jcc1 if we don't have CMOV) // cmovcc2 (same) // instead of: // setcc1 // setcc2 // and/or // cmovne (jne if we don't have CMOV) // When we can't use the CMOV instruction, it might increase branch // mispredicts. // When we can use CMOV, or when there is no mispredict, this improves // throughput and reduces register pressure. // if (CC == X86::COND_NE) { SDValue Flags; X86::CondCode CC0, CC1; bool isAndSetCC; if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { if (isAndSetCC) { std::swap(FalseOp, TrueOp); CC0 = X86::GetOppositeBranchCondition(CC0); CC1 = X86::GetOppositeBranchCondition(CC1); } SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), Flags}; SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); return CMOV; } } // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) -> // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2) // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) -> // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2) if ((CC == X86::COND_NE || CC == X86::COND_E) && Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) { SDValue Add = TrueOp; SDValue Const = FalseOp; // Canonicalize the condition code for easier matching and output. if (CC == X86::COND_E) std::swap(Add, Const); // We might have replaced the constant in the cmov with the LHS of the // compare. If so change it to the RHS of the compare. if (Const == Cond.getOperand(0)) Const = Cond.getOperand(1); // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant. if (isa(Const) && Add.getOpcode() == ISD::ADD && Add.hasOneUse() && isa(Add.getOperand(1)) && (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF || Add.getOperand(0).getOpcode() == ISD::CTTZ) && Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) { EVT VT = N->getValueType(0); // This should constant fold. SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1)); SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), DAG.getConstant(X86::COND_NE, DL, MVT::i8), Cond); return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1)); } } return SDValue(); } /// Different mul shrinking modes. enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { EVT VT = N->getOperand(0).getValueType(); if (VT.getScalarSizeInBits() != 32) return false; assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2"); unsigned SignBits[2] = {1, 1}; bool IsPositive[2] = {false, false}; for (unsigned i = 0; i < 2; i++) { SDValue Opd = N->getOperand(i); SignBits[i] = DAG.ComputeNumSignBits(Opd); IsPositive[i] = DAG.SignBitIsZero(Opd); } bool AllPositive = IsPositive[0] && IsPositive[1]; unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); // When ranges are from -128 ~ 127, use MULS8 mode. if (MinSignBits >= 25) Mode = MULS8; // When ranges are from 0 ~ 255, use MULU8 mode. else if (AllPositive && MinSignBits >= 24) Mode = MULU8; // When ranges are from -32768 ~ 32767, use MULS16 mode. else if (MinSignBits >= 17) Mode = MULS16; // When ranges are from 0 ~ 65535, use MULU16 mode. else if (AllPositive && MinSignBits >= 16) Mode = MULU16; else return false; return true; } /// When the operands of vector mul are extended from smaller size values, /// like i8 and i16, the type of mul may be shrinked to generate more /// efficient code. Two typical patterns are handled: /// Pattern1: /// %2 = sext/zext %1 to /// %4 = sext/zext %3 to // or %4 = build_vector %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul %2, %4 /// /// Pattern2: /// %2 = zext/sext %1 to /// %4 = zext/sext %3 to /// or %4 = build_vector %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul %2, %4 /// /// There are four mul shrinking modes: /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is /// -128 to 128, and the scalar value range of %4 is also -128 to 128, /// generate pmullw+sext32 for it (MULS8 mode). /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is /// 0 to 255, and the scalar value range of %4 is also 0 to 255, /// generate pmullw+zext32 for it (MULU8 mode). /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, /// generate pmullw+pmulhw for it (MULS16 mode). /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, /// generate pmullw+pmulhuw for it (MULU16 mode). static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Check for legality // pmullw/pmulhw are not supported by SSE. if (!Subtarget.hasSSE2()) return SDValue(); // Check for profitability // pmulld is supported since SSE41. It is better to use pmulld // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than // the expansion. bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(N, DAG, Mode)) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); if ((NumElts % 2) != 0) return SDValue(); unsigned RegSize = 128; MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); if (ExperimentalVectorWideningLegalization || NumElts >= OpsVT.getVectorNumElements()) { // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); if (Mode == MULU8 || Mode == MULS8) return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT, MulLo); MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. // Generate shuffle functioning as punpcklwd. SmallVector ShuffleMask(NumElts); for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i; ShuffleMask[2 * i + 1] = i + NumElts; } SDValue ResLo = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResLo = DAG.getBitcast(ResVT, ResLo); // Generate shuffle functioning as punpckhwd. for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i + NumElts / 2; ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; } SDValue ResHi = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResHi = DAG.getBitcast(ResVT, ResHi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want // to legalize the mul explicitly because implicit legalization for type // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack // instructions which will not exist when we explicitly legalize it by // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with // <4 x i16> undef). // // Legalize the operands of mul. // FIXME: We may be able to handle non-concatenated vectors by insertion. unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); if ((RegSize % ReducedSizeInBits) != 0) return SDValue(); SmallVector Ops(RegSize / ReducedSizeInBits, DAG.getUNDEF(ReducedVT)); Ops[0] = NewN0; NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); Ops[0] = NewN1; NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); if (Mode == MULU8 || Mode == MULS8) { // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower // part is needed. SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); // convert the type of mul result to VT. MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG : ISD::SIGN_EXTEND_VECTOR_INREG, DL, ResVT, Mul); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } // Generate the lower and higher part of mul: pmulhw/pmulhuw. For // MULU16/MULS16, both parts are needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, OpsVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. Make sure the type of mul result is VT. MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); Res = DAG.getBitcast(ResVT, Res); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL) { auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(Mult, DL, VT)); Result = DAG.getNode(ISD::SHL, DL, VT, Result, DAG.getConstant(Shift, DL, MVT::i8)); Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(Mul1, DL, VT)); Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result, DAG.getConstant(Mul2, DL, VT)); Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; switch (MulAmt) { default: break; case 11: // mul x, 11 => add ((shl (mul x, 5), 1), x) return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); case 21: // mul x, 21 => add ((shl (mul x, 5), 2), x) return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); case 41: // mul x, 41 => add ((shl (mul x, 5), 3), x) return combineMulShlAddOrSub(5, 3, /*isAdd*/ true); case 22: // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); case 19: // mul x, 19 => add ((shl (mul x, 9), 1), x) return combineMulShlAddOrSub(9, 1, /*isAdd*/ true); case 37: // mul x, 37 => add ((shl (mul x, 9), 2), x) return combineMulShlAddOrSub(9, 2, /*isAdd*/ true); case 73: // mul x, 73 => add ((shl (mul x, 9), 3), x) return combineMulShlAddOrSub(9, 3, /*isAdd*/ true); case 13: // mul x, 13 => add ((shl (mul x, 3), 2), x) return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); case 23: // mul x, 23 => sub ((shl (mul x, 3), 3), x) return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); case 26: // mul x, 26 => add ((mul (mul x, 5), 5), x) return combineMulMulAddOrSub(5, 5, /*isAdd*/ true); case 28: // mul x, 28 => add ((mul (mul x, 9), 3), x) return combineMulMulAddOrSub(9, 3, /*isAdd*/ true); case 29: // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulMulAddOrSub(9, 3, /*isAdd*/ true)); } // Another trick. If this is a power 2 + 2/4/8, we can use a shift followed // by a single LEA. // First check if this a sum of two power of 2s because that's easy. Then // count how many zeros are up to the first bit. // TODO: We can do this even without LEA at a cost of two shifts and an add. if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { unsigned ScaleShift = countTrailingZeros(MulAmt); if (ScaleShift >= 1 && ScaleShift < 4) { unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(ShiftAmt, DL, MVT::i8)); SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(ScaleShift, DL, MVT::i8)); return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2); } } return SDValue(); } // If the upper 17 bits of each element are zero then we can use PMADDWD, // which is always at least as quick as PMULLD, except on KNL. static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); if (Subtarget.isPMADDWDSlow()) return SDValue(); EVT VT = N->getValueType(0); // Only support vXi32 vectors. if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) return SDValue(); // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. // Also allow v2i32 if it will be widened. MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) || DAG.getTargetLoweringInfo().isTypeLegal(WVT))) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // If we are zero extending two steps without SSE4.1, its better to reduce // the vmul width instead. if (!Subtarget.hasSSE41() && (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getScalarValueSizeInBits() <= 8) && (N1.getOpcode() == ISD::ZERO_EXTEND && N1.getOperand(0).getScalarValueSizeInBits() <= 8)) return SDValue(); APInt Mask17 = APInt::getHighBitsSet(32, 17); if (!DAG.MaskedValueIsZero(N1, Mask17) || !DAG.MaskedValueIsZero(N0, Mask17)) return SDValue(); // Use SplitOpsAndApply to handle AVX splitting. auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) }, PMADDWDBuilder); } static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); EVT VT = N->getValueType(0); // Only support vXi64 vectors. if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 || VT.getVectorNumElements() < 2 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // MULDQ returns the 64-bit result of the signed multiplication of the lower // 32-bits. We can lower with this if the sign bits stretch that far. if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 && DAG.ComputeNumSignBits(N1) > 32) { auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, PMULDQBuilder, /*CheckBWI*/false); } // If the upper bits are zero we can use a single pmuludq. APInt Mask = APInt::getHighBitsSet(64, 32); if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) { auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, PMULUDQBuilder, /*CheckBWI*/false); } return SDValue(); } /// Optimize a single multiply with constant into two operations in order to /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget)) return V; if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget)) return V; if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); if (!MulConstantOptimization) return SDValue(); // An imul is usually smaller than the alternative sequence. if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); if (isPowerOf2_64(C->getZExtValue())) return SDValue(); int64_t SignMulAmt = C->getSExtValue(); assert(SignMulAmt != INT64_MIN && "Int min should have been handled!"); uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt; SDLoc DL(N); if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) { SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(AbsMulAmt, DL, VT)); if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); return NewMul; } uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; if ((AbsMulAmt % 9) == 0) { MulAmt1 = 9; MulAmt2 = AbsMulAmt / 9; } else if ((AbsMulAmt % 5) == 0) { MulAmt1 = 5; MulAmt2 = AbsMulAmt / 5; } else if ((AbsMulAmt % 3) == 0) { MulAmt1 = 3; MulAmt2 = AbsMulAmt / 3; } SDValue NewMul; // For negative multiply amounts, only allow MulAmt2 to be a power of 2. if (MulAmt2 && (isPowerOf2_64(MulAmt2) || (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) { if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) // If second multiplifer is pow2, issue it first. We want the multiply by // 3, 5, or 9 to be folded into the addressing mode unless the lone use // is an add. Only do this for positive multiply amounts since the // negate would prevent it from being used as an address mode anyway. std::swap(MulAmt1, MulAmt2); if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(MulAmt1, DL, VT)); if (isPowerOf2_64(MulAmt2)) NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); // Negate the result. if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); } else if (!Subtarget.slowLEA()) NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL); if (!NewMul) { assert(C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && "Both cases that could cause potential overflows should have " "already been handled."); if (isPowerOf2_64(AbsMulAmt - 1)) { // (mul x, 2^N + 1) => (add (shl x, N), x) NewMul = DAG.getNode( ISD::ADD, DL, VT, N->getOperand(0), DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, MVT::i8))); // To negate, subtract the number from zero if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); } else if (isPowerOf2_64(AbsMulAmt + 1)) { // (mul x, 2^N - 1) => (sub (shl x, N), x) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, MVT::i8)); // To negate, reverse the operands of the subtract. if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul); else NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) { // (mul x, 2^N + 2) => (add (add (shl x, N), x), x) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, MVT::i8)); NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0)); NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0)); } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) { // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, MVT::i8)); NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); } } return NewMul; } static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = N0.getConstantOperandAPInt(1); Mask <<= N1C->getAPIntValue(); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if // we carefully interrogate the mask to make sure we are semantics // preserving. // The transform is not safe if the result of C1 << C2 exceeds the bitwidth // of the underlying setcc_c operation if the setcc_c was zero extended. // Consider the following example: // zext(setcc_c) -> i32 0x0000FFFF // c1 -> i32 0x0000FFFF // c2 -> i32 0x00000001 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE if (N00.getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if (N00.getOpcode() == ISD::SIGN_EXTEND && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || N00.getOpcode() == ISD::ANY_EXTEND) && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); } if (MaskOK && Mask != 0) { SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } // Hardware support for vector shifts is sparse which makes us scalarize the // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V if (auto *N1BV = dyn_cast(N1)) if (auto *N1SplatC = N1BV->getConstantSplatNode()) { assert(N0.getValueType().isVector() && "Invalid vector shift type"); // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. if (N1SplatC->getAPIntValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } return SDValue(); } static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned Size = VT.getSizeInBits(); // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) // depending on sign of (SarConst - [56,48,32,24,16]) // sexts in X86 are MOVs. The MOVs have the same code size // as above SHIFTs (only SHIFT on 1 has lower code size). // However the MOVs have 2 advantages to a SHIFT: // 1. MOVs can write to a register that differs from source // 2. MOVs accept memory operands if (VT.isVector() || N1.getOpcode() != ISD::Constant || N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || N0.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); APInt ShlConst = (cast(N01))->getAPIntValue(); APInt SarConst = (cast(N1))->getAPIntValue(); EVT CVT = N1.getValueType(); if (SarConst.isNegative()) return SDValue(); for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); // skipping types without corresponding sext/zext and // ShlConst that is not one of [56,48,32,24,16] if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); SarConst = SarConst - (Size - ShiftSize); if (SarConst == 0) return NN; else if (SarConst.isNegative()) return DAG.getNode(ISD::SHL, DL, VT, NN, DAG.getConstant(-SarConst, DL, CVT)); else return DAG.getNode(ISD::SRA, DL, VT, NN, DAG.getConstant(SarConst, DL, CVT)); } return SDValue(); } static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // Only do this on the last DAG combine as it can interfere with other // combines. if (!DCI.isAfterLegalizeDAG()) return SDValue(); // Try to improve a sequence of srl (and X, C1), C2 by inverting the order. // TODO: This is a generic DAG combine that became an x86-only combine to // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and // and-not ('andn'). if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) return SDValue(); auto *ShiftC = dyn_cast(N1); auto *AndC = dyn_cast(N0.getOperand(1)); if (!ShiftC || !AndC) return SDValue(); // If we can shrink the constant mask below 8-bits or 32-bits, then this // transform should reduce code size. It may also enable secondary transforms // from improved known-bits analysis or instruction selection. APInt MaskVal = AndC->getAPIntValue(); // If this can be matched by a zero extend, don't optimize. if (MaskVal.isMask()) { unsigned TO = MaskVal.countTrailingOnes(); if (TO >= 8 && isPowerOf2_32(TO)) return SDValue(); } APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue()); unsigned OldMaskSize = MaskVal.getMinSignedBits(); unsigned NewMaskSize = NewMaskVal.getMinSignedBits(); if ((OldMaskSize > 8 && NewMaskSize <= 8) || (OldMaskSize > 32 && NewMaskSize <= 32)) { // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC) SDLoc DL(N); SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT); SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1); return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask); } return SDValue(); } static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned DstBitsPerElt = VT.getScalarSizeInBits(); unsigned SrcBitsPerElt = 2 * DstBitsPerElt; assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && N1.getScalarValueSizeInBits() == SrcBitsPerElt && "Unexpected PACKSS/PACKUS input type"); bool IsSigned = (X86ISD::PACKSS == Opcode); // Constant Folding. APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) && (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) && getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumDstElts = VT.getVectorNumElements(); unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; APInt Undefs(NumDstElts, 0); SmallVector Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt)); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0); auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0); if (UndefElts[SrcIdx]) { Undefs.setBit(Lane * NumDstEltsPerLane + Elt); continue; } APInt &Val = EltBits[SrcIdx]; if (IsSigned) { // PACKSS: Truncate signed value with signed saturation. // Source values less than dst minint are saturated to minint. // Source values greater than dst maxint are saturated to maxint. if (Val.isSignedIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getSignedMinValue(DstBitsPerElt); else Val = APInt::getSignedMaxValue(DstBitsPerElt); } else { // PACKUS: Truncate signed value with unsigned saturation. // Source values less than zero are saturated to zero. // Source values greater than dst maxuint are saturated to maxuint. if (Val.isIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getNullValue(DstBitsPerElt); else Val = APInt::getAllOnesValue(DstBitsPerElt); } Bits[Lane * NumDstEltsPerLane + Elt] = Val; } } return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular // truncate to create a larger truncate. if (Subtarget.hasAVX512() && N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 && N0.getOperand(0).getValueType() == MVT::v8i32) { if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) || (!IsSigned && DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) { if (Subtarget.hasVLX()) return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0)); // Widen input to v16i32 so we can truncate that. SDLoc dl(N); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32, N0.getOperand(0), DAG.getUNDEF(MVT::v8i32)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat); } } // Attempt to combine as shuffle. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; return SDValue(); } static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() || X86ISD::VSRL == N->getOpcode()) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Shift zero -> zero. if (ISD::isBuildVectorAllZeros(N0.getNode())) return DAG.getConstant(0, SDLoc(N), VT); // Detect constant shift amounts. APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) { unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false); return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0, EltBits[0].getZExtValue(), DAG); } APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, KnownZero, DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || X86ISD::VSRLI == Opcode) && "Unexpected shift opcode"); bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && "Unexpected value type"); assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type"); // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. unsigned ShiftVal = cast(N1)->getZExtValue(); if (ShiftVal >= NumBitsPerElt) { if (LogicalShift) return DAG.getConstant(0, SDLoc(N), VT); else ShiftVal = NumBitsPerElt - 1; } // Shift N0 by zero -> N0. if (!ShiftVal) return N0; // Shift zero -> zero. if (ISD::isBuildVectorAllZeros(N0.getNode())) return DAG.getConstant(0, SDLoc(N), VT); // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2) // clamped to (NumBitsPerElt - 1). if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) { unsigned ShiftVal2 = cast(N0.getOperand(1))->getZExtValue(); unsigned NewShiftVal = ShiftVal + ShiftVal2; if (NewShiftVal >= NumBitsPerElt) NewShiftVal = NumBitsPerElt - 1; return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8)); } // We can decode 'whole byte' logical bit shifts as shuffles. if (LogicalShift && (ShiftVal % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } // Constant Folding. APInt UndefElts; SmallVector EltBits; if (N->isOnlyUserOf(N0.getNode()) && getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { assert(EltBits.size() == VT.getVectorNumElements() && "Unexpected shift value type"); for (APInt &Elt : EltBits) { if (X86ISD::VSHLI == Opcode) Elt <<= ShiftVal; else if (X86ISD::VSRAI == Opcode) Elt.ashrInPlace(ShiftVal); else Elt.lshrInPlace(ShiftVal); } return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(NumBitsPerElt), DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) || (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) && "Unexpected vector insertion"); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(NumBitsPerElt), DCI)) return SDValue(N, 0); // Attempt to combine PINSRB/PINSRW patterns to a shuffle. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; return SDValue(); } /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for /// OR -> CMPNEQSS. static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned opcode; // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0.getOperand(1); SDValue CMP1 = N1.getOperand(1); SDLoc DL(N); // The SETCCs should both refer to the same CMP. if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) return SDValue(); SDValue CMP00 = CMP0->getOperand(0); SDValue CMP01 = CMP0->getOperand(1); EVT VT = CMP00.getValueType(); if (VT == MVT::f32 || VT == MVT::f64) { bool ExpectingFlags = false; // Check for any users that want flags: for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); !ExpectingFlags && UI != UE; ++UI) switch (UI->getOpcode()) { default: case ISD::BR_CC: case ISD::BRCOND: case ISD::SELECT: ExpectingFlags = true; break; case ISD::CopyToReg: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: break; } if (!ExpectingFlags) { enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { X86::CondCode tmp = cc0; cc0 = cc1; cc1 = tmp; } if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { // FIXME: need symbolic constants for these magic numbers. // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; if (Subtarget.hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, DAG.getConstant(0, DL, MVT::v16i1), FSetCC, DAG.getIntPtrConstant(0, DL)); return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL, N->getSimpleValueType(0)); } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; if (is64BitFP && !Subtarget.is64Bit()) { // On a 32-bit target, we cannot bitcast the 64-bit float to a // 64-bit integer, since that's not a legal type. Since // OnesOrZeroesF is all ones of all zeroes, we don't need all the // bits, but can do this little dance to extract the lowest 32 bits // and work with those going forward. SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, OnesOrZeroesF); SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32, DAG.getIntPtrConstant(0, DL)); IntVT = MVT::i32; } SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, DAG.getConstant(1, DL, IntVT)); SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); return OneBitOfTruth; } } } } return SDValue(); } // Match (xor X, -1) -> X. // Match extract_subvector(xor X, -1) -> extract_subvector(X). // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { V = peekThroughBitcasts(V); if (V.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) return V.getOperand(0); if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), Not, V.getOperand(1)); } } SmallVector CatOps; if (collectConcatOps(V.getNode(), CatOps)) { for (SDValue &CatOp : CatOps) { SDValue NotCat = IsNOT(CatOp, DAG); if (!NotCat) return SDValue(); CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); } return SDValue(); } /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); MVT VT = N->getSimpleValueType(0); if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); SDValue X, Y; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); if (SDValue Not = IsNOT(N0, DAG)) { X = Not; Y = N1; } else if (SDValue Not = IsNOT(N1, DAG)) { X = Not; Y = N0; } else return SDValue(); X = DAG.getBitcast(VT, X); Y = DAG.getBitcast(VT, Y); return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y); } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. This method optimizes // some of the transition sequences. // Even with AVX-512 this is still useful for removing casts around logical // operations on vXi1 mask types. static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); assert(VT.isVector() && "Expected vector type"); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow.getValueType(); if (Narrow->getOpcode() != ISD::XOR && Narrow->getOpcode() != ISD::AND && Narrow->getOpcode() != ISD::OR) return SDValue(); SDValue N0 = Narrow->getOperand(0); SDValue N1 = Narrow->getOperand(1); SDLoc DL(Narrow); // The Left side has to be a trunc. if (N0.getOpcode() != ISD::TRUNCATE) return SDValue(); // The type of the truncated inputs. if (N0.getOperand(0).getValueType() != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getValueType() == VT; if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT)) return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0.getOperand(0); if (RHSTrunc) N1 = N1.getOperand(0); else N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); // Generate the wide operation. SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1); unsigned Opcode = N->getOpcode(); switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType()); case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); } } /// If both input operands of a logic op are being cast from floating point /// types, try to convert this into a floating point logic node to avoid /// unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N10 = N1.getOperand(0); EVT N00Type = N00.getValueType(); EVT N10Type = N10.getValueType(); // Ensure that both types are the same and are legal scalar fp types. if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) || (Subtarget.hasSSE2() && N00Type == MVT::f64))) return SDValue(); unsigned FPOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected input node for FP logic conversion"); case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; } SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); return DAG.getBitcast(VT, FPLogic); } /// If this is a zero/all-bits result that is bitwise-anded with a low bits /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' /// with a shift-right to eliminate loading the vector constant mask value. static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); EVT VT0 = Op0.getValueType(); EVT VT1 = Op1.getValueType(); if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger()) return SDValue(); APInt SplatVal; if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || !SplatVal.isMask()) return SDValue(); // Don't prevent creation of ANDN. if (isBitwiseNot(Op0)) return SDValue(); if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)) return SDValue(); unsigned EltBitWidth = VT0.getScalarSizeInBits(); if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) return SDValue(); SDLoc DL(N); unsigned ShiftVal = SplatVal.countTrailingOnes(); SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } // Get the index node from the lowered DAG of a GEP IR instruction with one // indexing dimension. static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) { if (Ld->isIndexed()) return SDValue(); SDValue Base = Ld->getBasePtr(); if (Base.getOpcode() != ISD::ADD) return SDValue(); SDValue ShiftedIndex = Base.getOperand(0); if (ShiftedIndex.getOpcode() != ISD::SHL) return SDValue(); return ShiftedIndex.getOperand(0); } static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) { if (Subtarget.hasBMI2() && VT.isScalarInteger()) { switch (VT.getSizeInBits()) { default: return false; case 64: return Subtarget.is64Bit() ? true : false; case 32: return true; } } return false; } // This function recognizes cases where X86 bzhi instruction can replace and // 'and-load' sequence. // In case of loading integer value from an array of constants which is defined // as follows: // // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1} // // then applying a bitwise and on the result with another input. // It's equivalent to performing bzhi (zero high bits) on the input, with the // same index of the load. static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Node->getSimpleValueType(0); SDLoc dl(Node); // Check if subtarget has BZHI instruction for the node's type if (!hasBZHI(Subtarget, VT)) return SDValue(); // Try matching the pattern for both operands. for (unsigned i = 0; i < 2; i++) { SDValue N = Node->getOperand(i); LoadSDNode *Ld = dyn_cast(N.getNode()); // continue if the operand is not a load instruction if (!Ld) return SDValue(); const Value *MemOp = Ld->getMemOperand()->getValue(); if (!MemOp) return SDValue(); if (const GetElementPtrInst *GEP = dyn_cast(MemOp)) { if (GlobalVariable *GV = dyn_cast(GEP->getOperand(0))) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) { Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); if (!isa(Init) || !Ty->getArrayElementType()->isIntegerTy() || Ty->getArrayElementType()->getScalarSizeInBits() != VT.getSizeInBits() || Ty->getArrayNumElements() > Ty->getArrayElementType()->getScalarSizeInBits()) continue; // Check if the array's constant elements are suitable to our case. uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); bool ConstantsMatch = true; for (uint64_t j = 0; j < ArrayElementCount; j++) { ConstantInt *Elem = dyn_cast(Init->getAggregateElement(j)); if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) { ConstantsMatch = false; break; } } if (!ConstantsMatch) continue; // Do the transformation (For 32-bit type): // -> (and (load arr[idx]), inp) // <- (and (srl 0xFFFFFFFF, (sub 32, idx))) // that will be replaced with one bzhi instruction. SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0); SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32); // Get the Node which indexes into the array. SDValue Index = getIndexFromUnindexedLoad(Ld); if (!Index) return SDValue(); Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32); SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index); Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub); SDValue AllOnes = DAG.getAllOnesConstant(dl, VT); SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub); return DAG.getNode(ISD::AND, dl, VT, Inp, LShr); } } } } return SDValue(); } // Look for (and (ctpop X), 1) which is the IR form of __builtin_parity. // Turn it into series of XORs and a setnp. static SDValue combineParity(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); // We only support 64-bit and 32-bit. 64-bit requires special handling // unless the 64-bit popcnt instruction is legal. if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT)) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // LHS needs to be a single use CTPOP. if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse()) return SDValue(); // RHS needs to be 1. if (!isOneConstant(N1)) return SDValue(); SDLoc DL(N); SDValue X = N0.getOperand(0); // If this is 64-bit, its always best to xor the two 32-bit pieces together // even if we have popcnt. if (VT == MVT::i64) { SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(32, DL, MVT::i8))); SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); // Generate a 32-bit parity idiom. This will bring us back here if we need // to expand it too. SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32, DAG.getNode(ISD::CTPOP, DL, MVT::i32, X), DAG.getConstant(1, DL, MVT::i32)); return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity); } assert(VT == MVT::i32 && "Unexpected VT!"); // Xor the high and low 16-bits together using a 32-bit operation. SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(16, DL, MVT::i8)); X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16); // Finally xor the low 2 bytes together and use a 8-bit flag setting xor. // This should allow an h-reg to be used to save a shift. // FIXME: We only get an h-reg in 32-bit mode. SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(8, DL, MVT::i8))); SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); // Copy the inverse of the parity flag into a register with setcc. SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); // Zero extend to original type. return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); } static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); // If this is SSE1 only convert to FAND to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast( MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N->getOperand(0)), DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); } // Use a 32-bit and+zext if upper bits known zero. if (VT == MVT::i64 && Subtarget.is64Bit() && !isa(N->getOperand(1))) { APInt HiMask = APInt::getHighBitsSet(64, 32); if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) || DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) { SDLoc dl(N); SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0)); SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS)); } } // This must be done before legalization has expanded the ctpop. if (SDValue V = combineParity(N, DAG, Subtarget)) return V; // Match all-of bool scalar reductions into a bitcast/movmsk + cmp. // TODO: Support multiple SrcOps. if (VT == MVT::i1) { SmallVector SrcOps; if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) && SrcOps.size() == 1) { SDLoc dl(N); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); if (Mask) { APInt AllBits = APInt::getAllOnesValue(NumElts); return DAG.getSetCC(dl, MVT::i1, Mask, DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ); } } } if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) return R; if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; // Attempt to recursively combine a bitmask AND with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } // Attempt to combine a scalar bitmask AND with an extracted shuffle. if ((VT.getScalarSizeInBits() % 8) == 0 && N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa(N->getOperand(0).getOperand(1))) { SDValue BitMask = N->getOperand(1); SDValue SrcVec = N->getOperand(0).getOperand(0); EVT SrcVecVT = SrcVec.getValueType(); // Check that the constant bitmask masks whole bytes. APInt UndefElts; SmallVector EltBits; if (VT == SrcVecVT.getScalarType() && N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) && getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && llvm::all_of(EltBits, [](APInt M) { return M.isNullValue() || M.isAllOnesValue(); })) { unsigned NumElts = SrcVecVT.getVectorNumElements(); unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8; unsigned Idx = N->getOperand(0).getConstantOperandVal(1); // Create a root shuffle mask from the byte mask and the extracted index. SmallVector ShuffleMask(NumElts * Scale, SM_SentinelUndef); for (unsigned i = 0; i != Scale; ++i) { if (UndefElts[i]) continue; int VecIdx = Scale * Idx + i; ShuffleMask[VecIdx] = EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx; } if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); } } return SDValue(); } // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y)) static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); EVT VT = N->getValueType(0); if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0) return SDValue(); SDValue N0 = peekThroughBitcasts(N->getOperand(0)); SDValue N1 = peekThroughBitcasts(N->getOperand(1)); if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) return SDValue(); // On XOP we'll lower to PCMOV so accept one use, otherwise only // do this if either mask has multiple uses already. if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse())) return SDValue(); // Attempt to extract constant byte masks. APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0, false, false)) return SDValue(); if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1, false, false)) return SDValue(); for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) { // TODO - add UNDEF elts support. if (UndefElts0[i] || UndefElts1[i]) return SDValue(); if (EltBits0[i] != ~EltBits1[i]) return SDValue(); } SDLoc DL(N); SDValue X = N->getOperand(0); SDValue Y = DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)), DAG.getBitcast(VT, N1.getOperand(0))); return DAG.getNode(ISD::OR, DL, VT, X, Y); } // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { if (N->getOpcode() != ISD::OR) return false; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Canonicalize AND to LHS. if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); // Attempt to match OR(AND(M,Y),ANDNP(M,X)). if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) return false; Mask = N1.getOperand(0); X = N1.getOperand(1); // Check to see if the mask appeared in both the AND and ANDNP. if (N0.getOperand(0) == Mask) Y = N0.getOperand(1); else if (N0.getOperand(1) == Mask) Y = N0.getOperand(0); else return false; // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for // ANDNP combine allows other combines to happen that prevent matching. return true; } // Try to match: // (or (and (M, (sub 0, X)), (pandn M, X))) // which is a special case of vselect: // (vselect M, (sub 0, X), X) // Per: // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate // We know that, if fNegate is 0 or 1: // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) // // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) // ( M ? -X : X) == ((X ^ M ) + (M & 1)) // This lets us transform our vselect to: // (add (xor X, M), (and M, 1)) // And further to: // (sub (xor X, M), M) static SDValue combineLogicBlendIntoConditionalNegate( EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT MaskVT = Mask.getValueType(); assert(MaskVT.isInteger() && DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && "Mask must be zero/all-bits"); if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) return SDValue(); if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) return SDValue(); auto IsNegV = [](SDNode *N, SDValue V) { return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); }; SDValue V; if (IsNegV(Y.getNode(), X)) V = X; else if (IsNegV(X.getNode(), Y)) V = Y; else return SDValue(); SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); SDValue SubOp2 = Mask; // If the negate was on the false side of the select, then // the operands of the SUB need to be swapped. PR 27251. // This is because the pattern being matched above is // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) // but if the pattern matched was // (vselect M, X, (sub (0, X))), that is really negation of the pattern // above, -(vselect M, (sub 0, X), X), and therefore the replacement // pattern also needs to be a negation of the replacement pattern above. // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the // sub accomplishes the negation of the replacement pattern. if (V == Y) std::swap(SubOp1, SubOp2); SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); return DAG.getBitcast(VT, Res); } // Try to fold: // (or (and (m, y), (pandn m, x))) // into: // (vselect m, x, y) // As a special case, try to fold: // (or (and (m, (sub 0, x)), (pandn m, x))) // into: // (sub (xor X, M), M) static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); EVT VT = N->getValueType(0); if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || (VT.is256BitVector() && Subtarget.hasInt256()))) return SDValue(); SDValue X, Y, Mask; if (!matchLogicBlend(N, X, Y, Mask)) return SDValue(); // Validate that X, Y, and Mask are bitcasts, and see through them. Mask = peekThroughBitcasts(Mask); X = peekThroughBitcasts(X); Y = peekThroughBitcasts(Y); EVT MaskVT = Mask.getValueType(); unsigned EltBits = MaskVT.getScalarSizeInBits(); // TODO: Attempt to handle floating point cases as well? if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); // Attempt to combine to conditional negate: (sub (xor X, M), M) if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL, DAG, Subtarget)) return Res; // PBLENDVB is only available on SSE 4.1. if (!Subtarget.hasSSE41()) return SDValue(); MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } // Helper function for combineOrCmpEqZeroToCtlzSrl // Transforms: // seteq(cmp x, 0) // into: // srl(ctlz x), log2(bitsize(x)) // Input pattern is checked by caller. static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, SelectionDAG &DAG) { SDValue Cmp = Op.getOperand(1); EVT VT = Cmp.getOperand(0).getValueType(); unsigned Log2b = Log2_32(VT.getSizeInBits()); SDLoc dl(Op); SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0)); // The result of the shift is true or false, and on X86, the 32-bit // encoding of shr and lzcnt is more desirable. SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, DAG.getConstant(Log2b, dl, MVT::i8)); return DAG.getZExtOrTrunc(Scc, dl, ExtTy); } // Try to transform: // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) // into: // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)) // Will also attempt to match more generic cases, eg: // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0))) // Only applies if the target supports the FastLZCNT feature. static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast()) return SDValue(); auto isORCandidate = [](SDValue N) { return (N->getOpcode() == ISD::OR && N->hasOneUse()); }; // Check the zero extend is extending to 32-bit or more. The code generated by // srl(ctlz) for 16-bit or less variants of the pattern would require extra // instructions to clear the upper bits. if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) || !isORCandidate(N->getOperand(0))) return SDValue(); // Check the node matches: setcc(eq, cmp 0) auto isSetCCCandidate = [](SDValue N) { return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && N->getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(N->getOperand(1).getOperand(1)) && N->getOperand(1).getValueType().bitsGE(MVT::i32); }; SDNode *OR = N->getOperand(0).getNode(); SDValue LHS = OR->getOperand(0); SDValue RHS = OR->getOperand(1); // Save nodes matching or(or, setcc(eq, cmp 0)). SmallVector ORNodes; while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) || (isORCandidate(RHS) && isSetCCCandidate(LHS)))) { ORNodes.push_back(OR); OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode(); LHS = OR->getOperand(0); RHS = OR->getOperand(1); } // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)). if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) || !isORCandidate(SDValue(OR, 0))) return SDValue(); // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it // to // or(srl(ctlz),srl(ctlz)). // The dag combiner can then fold it into: // srl(or(ctlz, ctlz)). EVT VT = OR->getValueType(0); SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG); SDValue Ret, NewRHS; if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG))) Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS); if (!Ret) return SDValue(); // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern. while (ORNodes.size() > 0) { OR = ORNodes.pop_back_val(); LHS = OR->getOperand(0); RHS = OR->getOperand(1); // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); if (!NewRHS) return SDValue(); Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); } if (Ret) Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); return Ret; } static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // If this is SSE1 only convert to FOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) return R; if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; // Attempt to recursively combine an OR of shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent // series of shifts/or that would otherwise be generated. // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions // have higher latencies and we are not optimizing for size. if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); if (!N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); SDValue ShAmt0 = N0.getOperand(1); if (ShAmt0.getValueType() != MVT::i8) return SDValue(); SDValue ShAmt1 = N1.getOperand(1); if (ShAmt1.getValueType() != MVT::i8) return SDValue(); // Peek through any modulo shift masks. SDValue ShMsk0; if (ShAmt0.getOpcode() == ISD::AND && isa(ShAmt0.getOperand(1)) && ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk0 = ShAmt0; ShAmt0 = ShAmt0.getOperand(0); } SDValue ShMsk1; if (ShAmt1.getOpcode() == ISD::AND && isa(ShAmt1.getOperand(1)) && ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk1 = ShAmt1; ShAmt1 = ShAmt1.getOperand(0); } if (ShAmt0.getOpcode() == ISD::TRUNCATE) ShAmt0 = ShAmt0.getOperand(0); if (ShAmt1.getOpcode() == ISD::TRUNCATE) ShAmt1 = ShAmt1.getOperand(0); SDLoc DL(N); unsigned Opc = ISD::FSHL; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) { Opc = ISD::FSHR; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); std::swap(ShMsk0, ShMsk1); } auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1, SDValue Amt) { if (Opc == ISD::FSHR) std::swap(Op0, Op1); return DAG.getNode(Opc, DL, VT, Op0, Op1, DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt)); }; // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C ) // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C ) // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C ) // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C ) // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C ) // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C ) if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (auto *SumC = dyn_cast(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); if (ShAmt1Op1.getOpcode() == ISD::AND && isa(ShAmt1Op1.getOperand(1)) && ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) { ShMsk1 = ShAmt1Op1; ShAmt1Op1 = ShAmt1Op1.getOperand(0); } if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if ((SumC->getAPIntValue() == Bits || (SumC->getAPIntValue() == 0 && ShMsk1)) && ShAmt1Op1 == ShAmt0) return GetFunnelShift(Op0, Op1, ShAmt0); } } else if (auto *ShAmt1C = dyn_cast(ShAmt1)) { auto *ShAmt0C = dyn_cast(ShAmt0); if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) return GetFunnelShift(Op0, Op1, ShAmt0); } else if (ShAmt1.getOpcode() == ISD::XOR) { SDValue Mask = ShAmt1.getOperand(1); if (auto *MaskC = dyn_cast(Mask)) { unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL); SDValue ShAmt1Op0 = ShAmt1.getOperand(0); if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) ShAmt1Op0 = ShAmt1Op0.getOperand(0); if (MaskC->getSExtValue() == (Bits - 1) && (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { if (Op1.getOpcode() == InnerShift && isa(Op1.getOperand(1)) && Op1.getConstantOperandAPInt(1) == 1) { return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && Op1.getOperand(0) == Op1.getOperand(1)) { return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } } } } return SDValue(); } /// Try to turn tests against the signbit in the form of: /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) /// into: /// SETGT(X, -1) static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { // This is only worth doing if the output type is i8 or i1. EVT ResultType = N->getValueType(0); if (ResultType != MVT::i8 && ResultType != MVT::i1) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // We should be performing an xor against a truncated shift. if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) return SDValue(); // Make sure we are performing an xor against one. if (!isOneConstant(N1)) return SDValue(); // SetCC on x86 zero extends so only act on this if it's a logical shift. SDValue Shift = N0.getOperand(0); if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) return SDValue(); // Make sure we are truncating from one of i16, i32 or i64. EVT ShiftTy = Shift.getValueType(); if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) return SDValue(); // Make sure the shift amount extracts the sign bit. if (!isa(Shift.getOperand(1)) || Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. // N.B. Using SETGE against 0 works but we want a canonical looking // comparison, using SETGT matches up with what TranslateX86CC. SDLoc DL(N); SDValue ShiftOp = Shift.getOperand(0); EVT ShiftOpTy = ShiftOp.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResultType); SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp, DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); if (SetCCResultType != ResultType) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond); return Cond; } /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: /// pcmpgt X, -1 /// /// This should be called before type legalization because the pattern may not /// persist after that. static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (!VT.isSimple()) return SDValue(); switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break; case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break; } // There must be a shift right algebraic before the xor, and the xor must be a // 'not' operation. SDValue Shift = N->getOperand(0); SDValue Ones = N->getOperand(1); if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() || !ISD::isBuildVectorAllOnes(Ones.getNode())) return SDValue(); // The shift should be smearing the sign bit across each vector element. auto *ShiftAmt = isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true); if (!ShiftAmt || ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. We don't use the more obvious // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } /// Check if truncation with saturation form type \p SrcVT to \p DstVT /// is valid for the given \p Subtarget. static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX512()) return false; // FIXME: Scalar type may be supported if we move it to vector register. if (!SrcVT.isVector()) return false; EVT SrcElVT = SrcVT.getScalarType(); EVT DstElVT = DstVT.getScalarType(); if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32) return false; if (SrcVT.is512BitVector() || Subtarget.hasVLX()) return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); return false; } /// Detect patterns of truncation with unsigned saturation: /// /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// Return the source value x to be truncated or SDValue() if the pattern was /// not matched. /// /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type), /// where C1 >= 0 and C2 is unsigned max of destination type. /// /// (truncate (smax (smin (x, C2), C1)) to dest_type) /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2. /// /// These two patterns are equivalent to: /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type) /// So return the smax(x, C1) value to be truncated or SDValue() if the /// pattern was not matched. static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL) { EVT InVT = In.getValueType(); // Saturation with truncation. We truncate from InVT to VT. assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && "Unexpected types for truncate operation"); // Match min/max and return limit value as a parameter. auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue { if (V.getOpcode() == Opcode && ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit)) return V.getOperand(0); return SDValue(); }; APInt C1, C2; if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2)) // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according // the element size of the destination type. if (C2.isMask(VT.getScalarSizeInBits())) return UMin; if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2)) if (MatchMinMax(SMin, ISD::SMAX, C1)) if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits())) return SMin; if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1)) if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2)) if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1)) { return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1)); } return SDValue(); } /// Detect patterns of truncation with signed saturation: /// (truncate (smin ((smax (x, signed_min_of_dest_type)), /// signed_max_of_dest_type)) to dest_type) /// or: /// (truncate (smax ((smin (x, signed_max_of_dest_type)), /// signed_min_of_dest_type)) to dest_type). /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type]. /// Return the source value to be truncated or SDValue() if the pattern was not /// matched. static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) { unsigned NumDstBits = VT.getScalarSizeInBits(); unsigned NumSrcBits = In.getScalarValueSizeInBits(); assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation"); auto MatchMinMax = [](SDValue V, unsigned Opcode, const APInt &Limit) -> SDValue { APInt C; if (V.getOpcode() == Opcode && ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit) return V.getOperand(0); return SDValue(); }; APInt SignedMax, SignedMin; if (MatchPackUS) { SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits); SignedMin = APInt(NumSrcBits, 0); } else { SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits); SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits); } if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax)) if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin)) return SMax; if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin)) if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax)) return SMin; return SDValue(); } /// Detect a pattern of truncation with signed saturation. /// The types should allow to use VPMOVSS* instruction on AVX512. /// Return the source value to be truncated or SDValue() if the pattern was not /// matched. static SDValue detectAVX512SSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget, const TargetLowering &TLI) { if (!TLI.isTypeLegal(In.getValueType())) return SDValue(); if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) return SDValue(); return detectSSatPattern(In, VT); } /// Detect a pattern of truncation with saturation: /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// The types should allow to use VPMOVUS* instruction on AVX512. /// Return the source value to be truncated or SDValue() if the pattern was not /// matched. static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget, const TargetLowering &TLI) { if (!TLI.isTypeLegal(In.getValueType())) return SDValue(); if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) return SDValue(); return detectUSatPattern(In, VT, DAG, DL); } static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT SVT = VT.getScalarType(); EVT InVT = In.getValueType(); EVT InSVT = InVT.getScalarType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) && isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) { if (auto SSatVal = detectSSatPattern(In, VT)) return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); } if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) && !Subtarget.hasAVX512() && (SVT == MVT::i8 || SVT == MVT::i16) && (InSVT == MVT::i16 || InSVT == MVT::i32)) { if (auto USatVal = detectSSatPattern(In, VT, true)) { // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW). if (SVT == MVT::i8 && InSVT == MVT::i32) { EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL, DAG, Subtarget); if (Mid) return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, Subtarget); } else if (SVT == MVT::i8 || Subtarget.hasSSE41()) return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, Subtarget); } if (auto SSatVal = detectSSatPattern(In, VT)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, Subtarget); } return SDValue(); } /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { if (!VT.isVector()) return SDValue(); EVT InVT = In.getValueType(); unsigned NumElems = VT.getVectorNumElements(); EVT ScalarVT = VT.getVectorElementType(); if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2 && isPowerOf2_32(NumElems))) return SDValue(); // InScalarVT is the intermediate type in AVG pattern and it should be greater // than the original input type (i8/i16). EVT InScalarVT = InVT.getVectorElementType(); if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); // Detect the following pattern: // // %1 = zext %a to // %2 = zext %b to // %3 = add nuw nsw %1, // %4 = add nuw nsw %3, %2 // %5 = lshr %N, // %6 = trunc %5 to // // In AVX512, the last instruction can also be a trunc store. if (In.getOpcode() != ISD::SRL) return SDValue(); // A lambda checking the given SDValue is a constant vector and each element // is in the range [Min, Max]. auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { BuildVectorSDNode *BV = dyn_cast(V); if (!BV || !BV->isConstant()) return false; for (SDValue Op : V->ops()) { ConstantSDNode *C = dyn_cast(Op); if (!C) return false; const APInt &Val = C->getAPIntValue(); if (Val.ult(Min) || Val.ugt(Max)) return false; } return true; }; // Check if each element of the vector is left-shifted by one. auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); if (!IsConstVectorInRange(RHS, 1, 1)) return SDValue(); if (LHS.getOpcode() != ISD::ADD) return SDValue(); // Detect a pattern of a + b + 1 where the order doesn't matter. SDValue Operands[3]; Operands[0] = LHS.getOperand(0); Operands[1] = LHS.getOperand(1); auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); }; // Take care of the case when one of the operands is a constant vector whose // element is in the range [1, 256]. if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && Operands[0].getOpcode() == ISD::ZERO_EXTEND && Operands[0].getOperand(0).getValueType() == VT) { // The pattern is detected. Subtract one from the constant vector, then // demote it and emit X86ISD::AVG instruction. SDValue VecOnes = DAG.getConstant(1, DL, InVT); Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); return SplitOpsAndApply(DAG, Subtarget, DL, VT, { Operands[0].getOperand(0), Operands[1] }, AVGBuilder); } // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)). // Match the or case only if its 'add-like' - can be replaced by an add. auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) { if (ISD::ADD == V.getOpcode()) { Op0 = V.getOperand(0); Op1 = V.getOperand(1); return true; } if (ISD::ZERO_EXTEND != V.getOpcode()) return false; V = V.getOperand(0); if (V.getValueType() != VT || ISD::OR != V.getOpcode() || !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1))) return false; Op0 = V.getOperand(0); Op1 = V.getOperand(1); return true; }; SDValue Op0, Op1; if (FindAddLike(Operands[0], Op0, Op1)) std::swap(Operands[0], Operands[1]); else if (!FindAddLike(Operands[1], Op0, Op1)) return SDValue(); Operands[2] = Op0; Operands[1] = Op1; // Now we have three operands of two additions. Check that one of them is a // constant vector with ones, and the other two can be promoted from i8/i16. for (int i = 0; i < 3; ++i) { if (!IsConstVectorInRange(Operands[i], 1, 1)) continue; std::swap(Operands[i], Operands[2]); // Check if Operands[0] and Operands[1] are results of type promotion. for (int j = 0; j < 2; ++j) if (Operands[j].getValueType() != VT) { if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || Operands[j].getOperand(0).getValueType() != VT) return SDValue(); Operands[j] = Operands[j].getOperand(0); } // The pattern is detected, emit X86ISD::AVG instruction(s). return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]}, AVGBuilder); } return SDValue(); } static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { LoadSDNode *Ld = cast(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. Also split non-temporal aligned loads on // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, *Ld->getMemOperand(), &Fast) && !Fast))) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); unsigned HalfAlign = 16; SDValue Ptr1 = Ld->getBasePtr(); SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems / 2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), Alignment, Ld->getMemOperand()->getFlags()); SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, Ld->getPointerInfo().getWithOffset(HalfAlign), MinAlign(Alignment, HalfAlign), Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2); return DCI.CombineTo(N, NewVec, TF, true); } // Bool vector load - attempt to cast to an integer, as we have good // (vXiY *ext(vXi1 bitcast(iX))) handling. if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() && RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) { unsigned NumElts = RegVT.getVectorNumElements(); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); if (TLI.isTypeLegal(IntVT)) { SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Alignment, Ld->getMemOperand()->getFlags()); SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); } } return SDValue(); } /// If V is a build vector of boolean constants and exactly one of those /// constants is true, return the operand index of that true element. /// Otherwise, return -1. static int getOneTrueElt(SDValue V) { // This needs to be a build vector of booleans. // TODO: Checking for the i1 type matches the IR definition for the mask, // but the mask check could be loosened to i8 or other types. That might // also require checking more than 'allOnesValue'; eg, the x86 HW // instructions only require that the MSB is set for each mask element. // The ISD::MSTORE comments/definition do not specify how the mask operand // is formatted. auto *BV = dyn_cast(V); if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1) return -1; int TrueIndex = -1; unsigned NumElts = BV->getValueType(0).getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { const SDValue &Op = BV->getOperand(i); if (Op.isUndef()) continue; auto *ConstNode = dyn_cast(Op); if (!ConstNode) return -1; if (ConstNode->getAPIntValue().isAllOnesValue()) { // If we already found a one, this is too many. if (TrueIndex >= 0) return -1; TrueIndex = i; } } return TrueIndex; } /// Given a masked memory load/store operation, return true if it has one mask /// bit set. If it has one mask bit set, then also return the memory address of /// the scalar element to load/store, the vector index to insert/extract that /// scalar element, and the alignment for the scalar memory access. static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, unsigned &Alignment) { int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); if (TrueMaskElt < 0) return false; // Get the address of the one scalar element that is specified by the mask // using the appropriate offset from the base pointer. EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); Addr = MaskedOp->getBasePtr(); if (TrueMaskElt != 0) { unsigned Offset = TrueMaskElt * EltVT.getStoreSize(); Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp)); } Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize()); return true; } /// If exactly one element of the mask is set for a non-extending masked load, /// it is a scalar load and vector insert. /// Note: It is expected that the degenerate cases of an all-zeros or all-ones /// mask have already been optimized in IR, so we don't bother with those here. static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; unsigned Alignment; if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment)) return SDValue(); // Load the one scalar element that is specified by the mask using the // appropriate offset from the base pointer. SDLoc DL(ML); EVT VT = ML->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(), Alignment, ML->getMemOperand()->getFlags()); // Insert the loaded element into the appropriate place in the vector. SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getPassThru(), Load, VecIndex); return DCI.CombineTo(ML, Insert, Load.getValue(1), true); } static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) return SDValue(); SDLoc DL(ML); EVT VT = ML->getValueType(0); // If we are loading the first and last elements of a vector, it is safe and // always faster to load the whole vector. Replace the masked load with a // vector load and select. unsigned NumElts = VT.getVectorNumElements(); BuildVectorSDNode *MaskBV = cast(ML->getMask()); bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0)); bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1)); if (LoadFirstElt && LoadLastElt) { SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(), ML->getMemOperand()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getPassThru()); return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true); } // Convert a masked load with a constant mask into a masked load and a select. // This allows the select operation to use a faster kind of select instruction // (for example, vblendvps -> vblendps). // Don't try this if the pass-through operand is already undefined. That would // cause an infinite loop because that's what we're about to create. if (ML->getPassThru().isUndef()) return SDValue(); if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode())) return SDValue(); // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), ML->getMask(), DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(), ML->getExtensionType()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru()); return DCI.CombineTo(ML, Blend, NewML.getValue(1), true); } static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MaskedLoadSDNode *Mld = cast(N); // TODO: Expanding load with constant mask may be optimized as well. if (Mld->isExpandingLoad()) return SDValue(); if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) return ScalarLoad; // TODO: Do some AVX512 subsets benefit from this transform? if (!Subtarget.hasAVX512()) if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) return Blend; } if (Mld->getExtensionType() != ISD::EXTLOAD) return SDValue(); // Resolve extending loads. EVT VT = Mld->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); EVT LdVT = Mld->getMemoryVT(); SDLoc dl(Mld); assert(LdVT != VT && "Cannot extend to the same type"); unsigned ToSz = VT.getScalarSizeInBits(); unsigned FromSz = LdVT.getScalarSizeInBits(); // From/To sizes and ElemCount must be pow of two. assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for extending masked load"); unsigned SizeRatio = ToSz / FromSz; assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), LdVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); // Convert PassThru value. SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru()); if (!Mld->getPassThru().isUndef()) { SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru, DAG.getUNDEF(WideVecVT), ShuffleVec); } // Prepare the new mask. SDValue NewMask; SDValue Mask = Mld->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type. NewMask = DAG.getBitcast(WideVecVT, Mask); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), ShuffleVec); } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); SmallVector Ops(NumConcat, ZeroVal); Ops[0] = Mask; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), Mld->getBasePtr(), NewMask, WidePassThru, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i * SizeRatio] = i; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), ShuffleVec); SlicedVec = DAG.getBitcast(VT, SlicedVec); return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, /// it is a vector extract and scalar store. /// Note: It is expected that the degenerate cases of an all-zeros or all-ones /// mask have already been optimized in IR, so we don't bother with those here. static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG) { // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; unsigned Alignment; if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment)) return SDValue(); // Extract the one scalar element that is actually being stored. SDLoc DL(MS); EVT VT = MS->getValue().getValueType(); EVT EltVT = VT.getVectorElementType(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, MS->getValue(), VecIndex); // Store that element at the appropriate offset from the base pointer. return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(), Alignment, MS->getMemOperand()->getFlags()); } static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MaskedStoreSDNode *Mst = cast(N); if (Mst->isCompressingStore()) return SDValue(); EVT VT = Mst->getValue().getValueType(); EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!Mst->isTruncatingStore()) { if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) return ScalarStore; // If the mask value has been legalized to a non-boolean vector, try to // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); } // TODO: AVX512 targets should also be able to simplify something like the // pattern above, but that pattern will be different. It will either need to // match setcc more generally or match PCMPGTM later (in tablegen?). SDValue Value = Mst->getValue(); if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), Mst->getMemoryVT())) { return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), Mst->getBasePtr(), Mask, Mst->getMemoryVT(), Mst->getMemOperand(), true); } return SDValue(); } // Resolve truncating stores. unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. if (TLI.isTruncStoreLegal(VT, StVT)) return SDValue(); // From/To sizes and ElemCount must be pow of two. assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. assert (((NumElems * FromSz) % ToSz) == 0 && "Unexpected ratio for truncating masked store"); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), ShuffleVec); SDValue NewMask; SDValue Mask = Mst->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type. NewMask = DAG.getBitcast(WideVecVT, Mask); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) ShuffleVec[i] = NumElems*SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), ShuffleVec); } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); SmallVector Ops(NumConcat, ZeroVal); Ops[0] = Mask; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), NewMask, StVT, Mst->getMemOperand(), false); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { StoreSDNode *St = cast(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); unsigned Alignment = St->getAlignment(); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Convert a store of vXi1 into a store of iX and a bitcast. if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() && VT.getVectorElementType() == MVT::i1) { EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); StoredVal = DAG.getBitcast(NewVT, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // If this is a store of a scalar_to_vector to v1i1, just use a scalar store. // This will avoid a copy to k-register. if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() && StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR && StoredVal.getOperand(0).getValueType() == MVT::i8) { return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0), St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // Widen v2i1/v4i1 stores to v8i1. if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT && Subtarget.hasAVX512()) { unsigned NumConcats = 8 / VT.getVectorNumElements(); SmallVector Ops(NumConcats, DAG.getUNDEF(VT)); Ops[0] = StoredVal; StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // Turn vXi1 stores of constants into a scalar store. if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 || VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) && ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) { // If its a v64i1 store without 64-bit support, we need two stores. if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(0, 32)); Lo = combinevXi1ConstantToInteger(Lo, DAG); SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(32, 32)); Hi = combinevXi1ConstantToInteger(Hi, DAG); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl); SDValue Ch0 = DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(), Alignment, St->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4), MinAlign(Alignment, 4U), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. bool Fast; if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, *St->getMemOperand(), &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); return splitVectorStore(St, DAG); } // Split under-aligned vector non-temporal stores. if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) { // ZMM/YMM nt-stores - either it can be stored as a series of shorter // vectors or the legalizer can scalarize it to use MOVNTI. if (VT.is256BitVector() || VT.is512BitVector()) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); return splitVectorStore(St, DAG); } // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64 // to use MOVNTI. if (VT.is128BitVector() && Subtarget.hasSSE2()) { MVT NTVT = Subtarget.hasSSE4A() ? MVT::v2f64 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32); return scalarizeVectorStore(St, NTVT, DAG); } } // Try to optimize v16i16->v16i8 truncating stores when BWI is not // supported, but avx512f is by extending to v16i32 and truncating. if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() && St->getValue().getOpcode() == ISD::TRUNCATE && St->getValue().getOperand(0).getValueType() == MVT::v16i16 && TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) && !DCI.isBeforeLegalizeOps()) { SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), MVT::v16i8, St->getMemOperand()); } // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { // Check if we can detect an AVG pattern from the truncation. If yes, // replace the trunc store by a normal store with the result of X86ISD::AVG // instruction. if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl)) return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); if (SDValue Val = detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget, TLI)) return EmitTruncSStore(true /* Signed saturation */, St->getChain(), dl, Val, St->getBasePtr(), St->getMemoryVT(), St->getMemOperand(), DAG); if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), DAG, dl, Subtarget, TLI)) return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), dl, Val, St->getBasePtr(), St->getMemoryVT(), St->getMemOperand(), DAG); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) return SDValue(); // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. if (0 != (NumElems * FromSz) % ToSz) return SDValue(); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), ShuffleVec); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. // Find the largest store unit MVT StoreType = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && (64 <= NumElems * ToSz)) StoreType = MVT::f64; // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); SmallVector Chains; SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i, dl)); SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); Chains.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right // places to insert EMMS. This qualifies as a quick hack. // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. if (VT.getSizeInBits() != 64) return SDValue(); const Function &F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); if (((VT.isVector() && !VT.isFloatingPoint()) || (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa(St->getValue()) && !cast(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { LoadSDNode *Ld = cast(St->getValue().getNode()); SmallVector Ops; if (!ISD::isNormalLoad(Ld)) return SDValue(); // If this is not the MMX case, i.e. we are just turning i64 load/store // into f64 load/store, avoid the transformation if there are multiple // uses of the loaded value. if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) return SDValue(); SDLoc LdDL(Ld); SDLoc StDL(N); // If we are a 64-bit capable x86, lower to a single movq load/store pair. // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget.is64Bit() || F64IsLegal) { MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Make sure new load is placed in same chain order. DAG.makeEquivalentMemoryOrdering(Ld, NewLd); return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(), St->getMemOperand()); } // Otherwise, lower to two pairs of 32-bit loads / stores. SDValue LoAddr = Ld->getBasePtr(); SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL); SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), MinAlign(Ld->getAlignment(), 4), Ld->getMemOperand()->getFlags()); // Make sure new loads are placed in same chain order. DAG.makeEquivalentMemoryOrdering(Ld, LoLd); DAG.makeEquivalentMemoryOrdering(Ld, HiLd); LoAddr = St->getBasePtr(); HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); SDValue LoSt = DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4), MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } // This is similar to the above case, but here we handle a scalar 64-bit // integer store that is extracted from a vector on a 32-bit target. // If we have SSE2, then we can treat it like a floating-point double // to get past legalization. The execution dependencies fixup pass will // choose the optimal machine instruction for the store if this really is // an integer or v2f32 rather than an f64. if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() && St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue OldExtract = St->getOperand(1); SDValue ExtOp0 = OldExtract.getOperand(0); unsigned VecSize = ExtOp0.getValueSizeInBits(); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } return SDValue(); } /// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements /// of its first operand, then on successive elements of its second operand, /// returning the resulting values in a vector. For example, if /// A = < float a0, float a1, float a2, float a3 > /// and /// B = < float b0, float b1, float b2, float b3 > /// then the result of doing a horizontal operation on A and B is /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; // Look for the following pattern: // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > // and // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > // which is A horizontal-op B. MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); unsigned NumElts = VT.getVectorNumElements(); // TODO - can we make a general helper method that does all of this for us? auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1, SmallVectorImpl &ShuffleMask) { if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) { if (!Op.getOperand(0).isUndef()) N0 = Op.getOperand(0); if (!Op.getOperand(1).isUndef()) N1 = Op.getOperand(1); ArrayRef Mask = cast(Op)->getMask(); ShuffleMask.append(Mask.begin(), Mask.end()); return; } bool UseSubVector = false; if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && Op.getOperand(0).getValueType().is256BitVector() && llvm::isNullConstant(Op.getOperand(1))) { Op = Op.getOperand(0); UseSubVector = true; } bool IsUnary; SmallVector SrcOps; SmallVector SrcShuffleMask; SDValue BC = peekThroughBitcasts(Op); if (isTargetShuffle(BC.getOpcode()) && getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false, SrcOps, SrcShuffleMask, IsUnary)) { if (!UseSubVector && SrcShuffleMask.size() == NumElts && SrcOps.size() <= 2) { N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue(); N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue(); ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end()); } if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) && SrcOps.size() == 1) { N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op)); N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op)); ArrayRef Mask = ArrayRef(SrcShuffleMask).slice(0, NumElts); ShuffleMask.append(Mask.begin(), Mask.end()); } } }; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask // If LHS is not a shuffle, then pretend it is the identity shuffle: // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> // NOTE: A default initialized SDValue represents an UNDEF of type VT. SDValue A, B; SmallVector LMask; GetShuffle(LHS, A, B, LMask); // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; SmallVector RMask; GetShuffle(RHS, C, D, RMask); // At least one of the operands should be a vector shuffle. unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1); if (NumShuffles == 0) return false; if (LMask.empty()) { A = LHS; for (unsigned i = 0; i != NumElts; ++i) LMask.push_back(i); } if (RMask.empty()) { C = RHS; for (unsigned i = 0; i != NumElts; ++i) RMask.push_back(i); } // If A and B occur in reverse order in RHS, then canonicalize by commuting // RHS operands and shuffle mask. if (A != C) { std::swap(C, D); ShuffleVectorSDNode::commuteMask(RMask); } // Check that the shuffles are both shuffling the same vectors. if (!(A == C && B == D)) return false; // LHS and RHS are now: // LHS = shuffle A, B, LMask // RHS = shuffle A, B, RMask // Check that the masks correspond to performing a horizontal operation. // AVX defines horizontal add/sub to operate independently on 128-bit lanes, // so we just repeat the inner loop if this is a 256-bit op. unsigned Num128BitChunks = VT.getSizeInBits() / 128; unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; assert((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"); for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) { // Ignore undefined components. int LIdx = LMask[i + j], RIdx = RMask[i + j]; if (LIdx < 0 || RIdx < 0 || (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // The low half of the 128-bit result must choose from A. // The high half of the 128-bit result must choose from B, // unless B is undef. In that case, we are always choosing from A. unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j; if (!(LIdx == Index && RIdx == Index + 1) && !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) return false; LHS = DAG.getBitcast(VT, LHS); RHS = DAG.getBitcast(VT, RHS); return true; } /// Do target-specific dag combines on floating-point adds/subs. static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); bool IsFadd = N->getOpcode() == ISD::FADD; auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); return SDValue(); } /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify /// the codegen. /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove /// anything that is guaranteed to be transformed by DAGCombiner. static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"); SDValue Src = N->getOperand(0); unsigned SrcOpcode = Src.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); auto IsFreeTruncation = [VT](SDValue Op) { unsigned TruncSizeInBits = VT.getScalarSizeInBits(); // See if this has been extended from a smaller/equal size to // the truncation size, allowing a truncation to combine with the extend. unsigned Opcode = Op.getOpcode(); if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND) && Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) return true; // See if this is a single use constant which can be constant folded. // NOTE: We don't peek throught bitcasts here because there is currently // no support for constant folding truncate+bitcast+vector_of_constants. So // we'll just send up with a truncate on both operands which will // get turned back into (truncate (binop)) causing an infinite loop. return ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); }; auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0); SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1); return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1); }; // Don't combine if the operation has other uses. if (!Src.hasOneUse()) return SDValue(); // Only support vector truncation for now. // TODO: i64 scalar math would benefit as well. if (!VT.isVector()) return SDValue(); // In most cases its only worth pre-truncating if we're only facing the cost // of one truncation. // i.e. if one of the inputs will constant fold or the input is repeated. switch (SrcOpcode) { case ISD::AND: case ISD::XOR: case ISD::OR: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) && (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; } case ISD::MUL: // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its // better to truncate if we have the chance. if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(SrcOpcode, VT) && !TLI.isOperationLegal(SrcOpcode, SrcVT)) return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); LLVM_FALLTHROUGH; case ISD::ADD: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(SrcOpcode, VT) && (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; } case ISD::SUB: { // TODO: ISD::SUB We are conservative and require both sides to be freely // truncatable to avoid interfering with combineSubToSubus. SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(SrcOpcode, VT) && (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1)))) return TruncateArithmetic(Op0, Op1); break; } } return SDValue(); } /// Truncate using ISD::AND mask and X86ISD::PACKUS. /// e.g. trunc <8 x i32> X to <8 x i16> --> /// MaskX = X & 0xffff (clear high bits to prevent saturation) /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1) static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); EVT OutVT = N->getValueType(0); APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(), OutVT.getScalarSizeInBits()); In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT)); return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget); } /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); EVT OutVT = N->getValueType(0); In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In, DAG.getValueType(OutVT)); return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget); } /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type /// legalization the truncation will be translated into a BUILD_VECTOR with each /// element that is extracted from a vector and then truncated, and it is /// difficult to do this optimization based on them. static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT OutVT = N->getValueType(0); if (!OutVT.isVector()) return SDValue(); SDValue In = N->getOperand(0); if (!In.getValueType().isSimple()) return SDValue(); EVT InVT = In.getValueType(); unsigned NumElems = OutVT.getVectorNumElements(); // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on // SSE2, and we need to take care of it specially. // AVX512 provides vpmovdb. if (!Subtarget.hasSSE2() || Subtarget.hasAVX2()) return SDValue(); EVT OutSVT = OutVT.getVectorElementType(); EVT InSVT = InVT.getVectorElementType(); if (!((InSVT == MVT::i32 || InSVT == MVT::i64) && (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && NumElems >= 8)) return SDValue(); // SSSE3's pshufb results in less instructions in the cases below. if (Subtarget.hasSSSE3() && NumElems == 8 && ((OutSVT == MVT::i8 && InSVT != MVT::i64) || (InSVT == MVT::i32 && OutSVT == MVT::i16))) return SDValue(); SDLoc DL(N); // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to // truncate 2 x v4i32 to v8i16. if (Subtarget.hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG); if (InSVT == MVT::i32) return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG); return SDValue(); } /// This function transforms vector truncation of 'extended sign-bits' or /// 'extended zero-bits' values. /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Requires SSE2 but AVX512 has fast truncate. if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) return SDValue(); SDValue In = N->getOperand(0); if (!In.getValueType().isSimple()) return SDValue(); MVT VT = N->getValueType(0).getSimpleVT(); MVT SVT = VT.getScalarType(); MVT InVT = In.getValueType().getSimpleVT(); MVT InSVT = InVT.getScalarType(); // Check we have a truncation suited for PACKSS/PACKUS. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) return SDValue(); if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); unsigned NumPackedSignBits = std::min(SVT.getSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; // Use PACKUS if the input has zero-bits that extend all the way to the // packed/truncated value. e.g. masks, zext_in_reg, etc. KnownBits Known = DAG.computeKnownBits(In); unsigned NumLeadingZeroBits = Known.countMinLeadingZeros(); if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits)) return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget); // Use PACKSS if the input has sign-bits that extend all the way to the // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. unsigned NumSignBits = DAG.ComputeNumSignBits(In); if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); return SDValue(); } // Try to form a MULHU or MULHS node by looking for // (trunc (srl (mul ext, ext), 16)) // TODO: This is X86 specific because we want to be able to handle wide types // before type legalization. But we can only do it if the vector will be // legalized via widening/splitting. Type legalization can't handle promotion // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG // combiner. static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // First instruction should be a right shift of a multiply. if (Src.getOpcode() != ISD::SRL || Src.getOperand(0).getOpcode() != ISD::MUL) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); // Only handle vXi16 types that are at least 128-bits unless they will be // widened. if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 || (!ExperimentalVectorWideningLegalization && VT.getVectorNumElements() < 8)) return SDValue(); // Input type should be vXi32. EVT InVT = Src.getValueType(); if (InVT.getVectorElementType() != MVT::i32) return SDValue(); // Need a shift by 16. APInt ShiftAmt; if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) || ShiftAmt != 16) return SDValue(); SDValue LHS = Src.getOperand(0).getOperand(0); SDValue RHS = Src.getOperand(0).getOperand(1); unsigned ExtOpc = LHS.getOpcode(); if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || RHS.getOpcode() != ExtOpc) return SDValue(); // Peek through the extends. LHS = LHS.getOperand(0); RHS = RHS.getOperand(0); // Ensure the input types match. if (LHS.getValueType() != VT || RHS.getValueType() != VT) return SDValue(); unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; return DAG.getNode(Opc, DL, VT, LHS, RHS); } // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes // from one vector with signed bytes from another vector, adds together // adjacent pairs of 16-bit products, and saturates the result before // truncating to 16-bits. // // Which looks something like this: // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))), // (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B)))))))) static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { if (!VT.isVector() || !Subtarget.hasSSSE3()) return SDValue(); unsigned NumElems = VT.getVectorNumElements(); EVT ScalarVT = VT.getVectorElementType(); if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems)) return SDValue(); SDValue SSatVal = detectSSatPattern(In, VT); if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) return SDValue(); // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs // of multiplies from even/odd elements. SDValue N0 = SSatVal.getOperand(0); SDValue N1 = SSatVal.getOperand(1); if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); // TODO: Handle constant vectors and use knownbits/computenumsignbits? // Canonicalize zero_extend to LHS. if (N01.getOpcode() == ISD::ZERO_EXTEND) std::swap(N00, N01); if (N11.getOpcode() == ISD::ZERO_EXTEND) std::swap(N10, N11); // Ensure we have a zero_extend and a sign_extend. if (N00.getOpcode() != ISD::ZERO_EXTEND || N01.getOpcode() != ISD::SIGN_EXTEND || N10.getOpcode() != ISD::ZERO_EXTEND || N11.getOpcode() != ISD::SIGN_EXTEND) return SDValue(); // Peek through the extends. N00 = N00.getOperand(0); N01 = N01.getOperand(0); N10 = N10.getOperand(0); N11 = N11.getOperand(0); // Ensure the extend is from vXi8. if (N00.getValueType().getVectorElementType() != MVT::i8 || N01.getValueType().getVectorElementType() != MVT::i8 || N10.getValueType().getVectorElementType() != MVT::i8 || N11.getValueType().getVectorElementType() != MVT::i8) return SDValue(); // All inputs should be build_vectors. if (N00.getOpcode() != ISD::BUILD_VECTOR || N01.getOpcode() != ISD::BUILD_VECTOR || N10.getOpcode() != ISD::BUILD_VECTOR || N11.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // N00/N10 are zero extended. N01/N11 are sign extended. // For each element, we need to ensure we have an odd element from one vector // multiplied by the odd element of another vector and the even element from // one of the same vectors being multiplied by the even element from the // other vector. So we need to make sure for each element i, this operator // is being performed: // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] SDValue ZExtIn, SExtIn; for (unsigned i = 0; i != NumElems; ++i) { SDValue N00Elt = N00.getOperand(i); SDValue N01Elt = N01.getOperand(i); SDValue N10Elt = N10.getOperand(i); SDValue N11Elt = N11.getOperand(i); // TODO: Be more tolerant to undefs. if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); auto *ConstN00Elt = dyn_cast(N00Elt.getOperand(1)); auto *ConstN01Elt = dyn_cast(N01Elt.getOperand(1)); auto *ConstN10Elt = dyn_cast(N10Elt.getOperand(1)); auto *ConstN11Elt = dyn_cast(N11Elt.getOperand(1)); if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) return SDValue(); unsigned IdxN00 = ConstN00Elt->getZExtValue(); unsigned IdxN01 = ConstN01Elt->getZExtValue(); unsigned IdxN10 = ConstN10Elt->getZExtValue(); unsigned IdxN11 = ConstN11Elt->getZExtValue(); // Add is commutative so indices can be reordered. if (IdxN00 > IdxN10) { std::swap(IdxN00, IdxN10); std::swap(IdxN01, IdxN11); } // N0 indices be the even element. N1 indices must be the next odd element. if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i || IdxN11 != 2 * i + 1) return SDValue(); SDValue N00In = N00Elt.getOperand(0); SDValue N01In = N01Elt.getOperand(0); SDValue N10In = N10Elt.getOperand(0); SDValue N11In = N11Elt.getOperand(0); // First time we find an input capture it. if (!ZExtIn) { ZExtIn = N00In; SExtIn = N01In; } if (ZExtIn != N00In || SExtIn != N01In || ZExtIn != N10In || SExtIn != N11In) return SDValue(); } auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. EVT InVT = Ops[0].getValueType(); assert(InVT.getScalarType() == MVT::i8 && "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, InVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn }, PMADDBuilder); } static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); // Attempt to pre-truncate inputs to arithmetic ops instead. if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL)) return V; // Try to detect AVG pattern first. if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; // Try to detect PMADD if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL)) return PMAdd; // Try to combine truncation with signed/unsigned saturation. if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget)) return Val; // Try to combine PMULHUW/PMULHW for vXi16. if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget)) return V; // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { SDValue BCSrc = Src.getOperand(0); if (BCSrc.getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); } // Try to truncate extended sign/zero bits with PACKSS/PACKUS. if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) return V; return combineVectorTruncation(N, DAG, Subtarget); } /// Returns the negated value if the node \p N flips sign of FP value. /// /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000) /// or FSUB(0, x) /// AVX512F does not have FXOR, so FNEG is lowered as /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))). /// In this case we go though all bitcasts. /// This also recognizes splat of a negated value and returns the splat of that /// value. static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { if (N->getOpcode() == ISD::FNEG) return N->getOperand(0); unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits(); SDValue Op = peekThroughBitcasts(SDValue(N, 0)); EVT VT = Op->getValueType(0); // Make sure the element size does't change. if (VT.getScalarSizeInBits() != ScalarSize) return SDValue(); if (auto SVOp = dyn_cast(Op.getNode())) { // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. if (!SVOp->getOperand(1).isUndef()) return SDValue(); if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode())) if (NegOp0.getValueType() == VT) // FIXME: Can we do better? return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), SVOp->getMask()); return SDValue(); } unsigned Opc = Op.getOpcode(); if (Opc == ISD::INSERT_VECTOR_ELT) { // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF, // -V, INDEX). SDValue InsVector = Op.getOperand(0); SDValue InsVal = Op.getOperand(1); if (!InsVector.isUndef()) return SDValue(); if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode())) if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, NegInsVal, Op.getOperand(2)); return SDValue(); } if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB) return SDValue(); SDValue Op1 = Op.getOperand(1); SDValue Op0 = Op.getOperand(0); // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit // masks. For FSUB, we have to check if constant bits of Op0 are sign bit // masks and hence we swap the operands. if (Opc == ISD::FSUB) std::swap(Op0, Op1); APInt UndefElts; SmallVector EltBits; // Extract constant bits and see if they are all sign bit masks. Ignore the // undef elements. if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) { for (unsigned I = 0, E = EltBits.size(); I < E; I++) if (!UndefElts[I] && !EltBits[I].isSignMask()) return SDValue(); return peekThroughBitcasts(Op0); } return SDValue(); } /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT OrigVT = N->getValueType(0); SDValue Arg = isFNEG(DAG, N); if (!Arg) return SDValue(); EVT VT = Arg.getValueType(); EVT SVT = VT.getScalarType(); SDLoc DL(N); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // If we're negating a FMUL node on a target with FMA, then we can avoid the // use of a constant by performing (-0 - A*B) instead. // FIXME: Check rounding control flags as well once it becomes available. if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Zero); return DAG.getBitcast(OrigVT, NewNode); } // If we're negating an FMA node, then we can adjust the // instruction to include the extra negation. unsigned NewOpcode = 0; if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) { switch (Arg.getOpcode()) { case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break; case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; // We can't handle scalar intrinsic node here because it would only // invert one element and not the whole vector. But we could try to handle // a negation of the lower element only. } } if (NewOpcode) return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg.getNode()->ops())); return SDValue(); } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); // If we have integer vector types available, use the integer opcodes. if (!VT.isVector() || !Subtarget.hasSSE2()) return SDValue(); SDLoc dl(N); unsigned IntBits = VT.getScalarSizeInBits(); MVT IntSVT = MVT::getIntegerVT(IntBits); MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits); SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); unsigned IntOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected FP logic op"); case X86ISD::FOR: IntOpcode = ISD::OR; break; case X86ISD::FXOR: IntOpcode = ISD::XOR; break; case X86ISD::FAND: IntOpcode = ISD::AND; break; case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; } SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); return DAG.getBitcast(VT, IntOp); } /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val) static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() != ISD::XOR) return SDValue(); SDValue LHS = N->getOperand(0); auto *RHSC = dyn_cast(N->getOperand(1)); if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC) return SDValue(); X86::CondCode NewCC = X86::GetOppositeBranchCondition( X86::CondCode(LHS->getConstantOperandVal(0))); SDLoc DL(N); return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); } static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // If this is SSE1 only convert to FXOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && N->getValueType(0) == MVT::v4i32) { return DAG.getBitcast( MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N->getOperand(0)), DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); } if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue SetCC = foldXor1SetCC(N, DAG)) return SetCC; if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) return RV; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; return combineFneg(N, DAG, Subtarget); } static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); unsigned NumBits = VT.getSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // TODO - Constant Folding. if (auto *Cst1 = dyn_cast(Op1)) { // Reduce Cst1 to the bottom 16-bits. // NOTE: SimplifyDemandedBits won't do this for constants. const APInt &Val1 = Cst1->getAPIntValue(); APInt MaskedVal1 = Val1 & 0xFFFF; if (MaskedVal1 != Val1) return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0, DAG.getConstant(MaskedVal1, SDLoc(N), VT)); } // Only bottom 16-bits of the control bits are required. APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16)); if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI)) return SDValue(N, 0); return SDValue(); } static bool isNullFPScalarOrVectorConst(SDValue V) { return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode()); } /// If a value is a scalar FP zero or a vector FP zero (potentially including /// undefined elements), return a zero constant that may be used to fold away /// that value. In the case of a vector, the returned constant will not contain /// undefined elements even if the input parameter does. This makes it suitable /// to be used as a replacement operand with operations (eg, bitwise-and) where /// an undef should not propagate. static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!isNullFPScalarOrVectorConst(V)) return SDValue(); if (V.getValueType().isVector()) return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V)); return V; } static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); // Vector types are handled in combineANDXORWithAllOnesIntoANDNP(). if (!((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::f64 && Subtarget.hasSSE2()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2()))) return SDValue(); auto isAllOnesConstantFP = [](SDValue V) { if (V.getSimpleValueType().isVector()) return ISD::isBuildVectorAllOnes(V.getNode()); auto *C = dyn_cast(V); return C && C->getConstantFPValue()->isAllOnesValue(); }; // fand (fxor X, -1), Y --> fandn X, Y if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1))) return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1); // fand X, (fxor Y, -1) --> fandn Y, X if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1))) return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0); return SDValue(); } /// Do target-specific dag combines on X86ISD::FAND nodes. static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FAND(0.0, x) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget)) return V; // FAND(x, 0.0) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) return V; if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget)) return V; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes. static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FANDN(0.0, x) -> x if (isNullFPScalarOrVectorConst(N->getOperand(0))) return N->getOperand(1); // FANDN(x, 0.0) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) return V; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x if (isNullFPScalarOrVectorConst(N->getOperand(0))) return N->getOperand(1); // F[X]OR(x, 0.0) -> x if (isNullFPScalarOrVectorConst(N->getOperand(1))) return N->getOperand(0); if (SDValue NewVal = combineFneg(N, DAG, Subtarget)) return NewVal; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); // Only perform optimizations if UnsafeMath is used. if (!DAG.getTarget().Options.UnsafeFPMath) return SDValue(); // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes // into FMINC and FMAXC, which are Commutative operations. unsigned NewOp = 0; switch (N->getOpcode()) { default: llvm_unreachable("unknown opcode"); case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; } return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); } static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Subtarget.useSoftFloat()) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64) || (VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDLoc DL(N); auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; // If we don't have to respect NaN inputs, this is a direct translation to x86 // min/max instructions. if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs()) return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); // If one of the operands is known non-NaN use the native min/max instructions // with the non-NaN input as second operand. if (DAG.isKnownNeverNaN(Op1)) return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); if (DAG.isKnownNeverNaN(Op0)) return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags()); // If we have to respect NaN inputs, this takes at least 3 instructions. // Favor a library call when operating on a scalar and minimizing code size. if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); // There are 4 possibilities involving NaN inputs, and these are the required // outputs: // Op1 // Num NaN // ---------------- // Num | Max | Op0 | // Op0 ---------------- // NaN | Op1 | NaN | // ---------------- // // The SSE FP max/min instructions were not designed for this case, but rather // to implement: // Min = Op1 < Op0 ? Op1 : Op0 // Max = Op1 > Op0 ? Op1 : Op0 // // So they always return Op0 if either input is a NaN. However, we can still // use those instructions for fmaxnum by selecting away a NaN input. // If either operand is NaN, the 2nd source operand (Op0) is passed through. SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO); // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); } static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, KnownZero, DCI)) return SDValue(N, 0); // Convert a full vector load into vzload when not all bits are needed. SDValue In = N->getOperand(0); MVT InVT = In.getSimpleValueType(); if (VT.getVectorNumElements() < InVT.getVectorNumElements() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast(N->getOperand(0)); // Unless the load is volatile. if (!LN->isVolatile()) { SDLoc dl(N); unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); MVT MemVT = MVT::getIntegerVT(NumBits); MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; SDValue VZLoad = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, LN->getPointerInfo(), LN->getAlignment(), LN->getMemOperand()->getFlags()); SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); DCI.CombineTo(N, Convert); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); return SDValue(N, 0); } } return SDValue(); } static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); // Convert a full vector load into vzload when not all bits are needed. SDValue In = N->getOperand(0); MVT InVT = In.getSimpleValueType(); if (VT.getVectorNumElements() < InVT.getVectorNumElements() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast(N->getOperand(0)); // Unless the load is volatile. if (!LN->isVolatile()) { SDLoc dl(N); unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); MVT MemVT = MVT::getFloatingPointVT(NumBits); MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; SDValue VZLoad = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, LN->getPointerInfo(), LN->getAlignment(), LN->getMemOperand()->getFlags()); SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); DCI.CombineTo(N, Convert); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); return SDValue(N, 0); } } return SDValue(); } /// Do target-specific dag combines on X86ISD::ANDNP nodes. static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); // ANDNP(0, x) -> x if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) return N->getOperand(1); // ANDNP(x, 0) -> 0 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) return DAG.getConstant(0, SDLoc(N), VT); // Turn ANDNP back to AND if input is inverted. if (SDValue Not = IsNOT(N->getOperand(0), DAG)) return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N->getOperand(1)); // Attempt to recursively combine a bitmask ANDNP with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } return SDValue(); } static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // BT ignores high bits in the bit index operand. unsigned BitWidth = N1.getValueSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask)) return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1); return SDValue(); } // Try to combine sext_in_reg of a cmov of constants by extending the constants. static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); EVT DstVT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast(N1)->getVT(); if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16) return SDValue(); // Look through single use any_extends / truncs. SDValue IntermediateBitwidthOp; if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) && N0.hasOneUse()) { IntermediateBitwidthOp = N0; N0 = N0.getOperand(0); } // See if we have a single use cmov. if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse()) return SDValue(); SDValue CMovOp0 = N0.getOperand(0); SDValue CMovOp1 = N0.getOperand(1); // Make sure both operands are constants. if (!isa(CMovOp0.getNode()) || !isa(CMovOp1.getNode())) return SDValue(); SDLoc DL(N); // If we looked through an any_extend/trunc above, add one to the constants. if (IntermediateBitwidthOp) { unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode(); CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0); CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1); } CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1); CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1); EVT CMovVT = DstVT; // We do not want i16 CMOV's. Promote to i32 and truncate afterwards. if (DstVT == MVT::i16) { CMovVT = MVT::i32; CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0); CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1); } SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1, N0.getOperand(2), N0.getOperand(3)); if (CMovVT != DstVT) CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov); return CMov; } static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); if (SDValue V = combineSextInRegCmov(N, DAG)) return V; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast(N1)->getVT(); SDLoc dl(N); // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the // both SSE and AVX2 since there is no sign-extended shift right // operation on a vector with 64-bit elements. //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); // EXTLOAD has a better solution on AVX2, // it may be replaced with X86ISD::VSEXT node. if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256()) if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } } return SDValue(); } /// sext(add_nsw(x, C)) --> add(sext(x), C_sext) /// zext(add_nuw(x, C)) --> add(zext(x), C_zext) /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes /// opportunities to combine math ops, use an LEA, or use a complex addressing /// mode. This can eliminate extend, add, and shift instructions. static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Ext->getOpcode() != ISD::SIGN_EXTEND && Ext->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); // TODO: This should be valid for other integer types. EVT VT = Ext->getValueType(0); if (VT != MVT::i64) return SDValue(); SDValue Add = Ext->getOperand(0); if (Add.getOpcode() != ISD::ADD) return SDValue(); bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND; bool NSW = Add->getFlags().hasNoSignedWrap(); bool NUW = Add->getFlags().hasNoUnsignedWrap(); // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding // into the 'zext' if ((Sext && !NSW) || (!Sext && !NUW)) return SDValue(); // Having a constant operand to the 'add' ensures that we are not increasing // the instruction count because the constant is extended for free below. // A constant operand can also become the displacement field of an LEA. auto *AddOp1 = dyn_cast(Add.getOperand(1)); if (!AddOp1) return SDValue(); // Don't make the 'add' bigger if there's no hope of combining it with some // other 'add' or 'shl' instruction. // TODO: It may be profitable to generate simpler LEA instructions in place // of single 'add' instructions, but the cost model for selecting an LEA // currently has a high threshold. bool HasLEAPotential = false; for (auto *User : Ext->uses()) { if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { HasLEAPotential = true; break; } } if (!HasLEAPotential) return SDValue(); // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'. int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue(); SDValue AddOp0 = Add.getOperand(0); SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0); SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); // The wider add is guaranteed to not wrap because both operands are // sign-extended. SDNodeFlags Flags; Flags.setNoSignedWrap(NSW); Flags.setNoUnsignedWrap(NUW); return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags); } // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant // operands and the result of CMOV is not used anywhere else - promote CMOV // itself instead of promoting its result. This could be beneficial, because: // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two // (or more) pseudo-CMOVs only when they go one-after-another and // getting rid of result extension code after CMOV will help that. // 2) Promotion of constant CMOV arguments is free, hence the // {ANY,SIGN,ZERO}_EXTEND will just be deleted. // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this // promotion is also good in terms of code-size. // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit // promotion). static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) { SDValue CMovN = Extend->getOperand(0); if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse()) return SDValue(); EVT TargetVT = Extend->getValueType(0); unsigned ExtendOpcode = Extend->getOpcode(); SDLoc DL(Extend); EVT VT = CMovN.getValueType(); SDValue CMovOp0 = CMovN.getOperand(0); SDValue CMovOp1 = CMovN.getOperand(1); if (!isa(CMovOp0.getNode()) || !isa(CMovOp1.getNode())) return SDValue(); // Only extend to i32 or i64. if (TargetVT != MVT::i32 && TargetVT != MVT::i64) return SDValue(); // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32 // are free. if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32)) return SDValue(); // If this a zero extend to i64, we should only extend to i32 and use a free // zero extend to finish. EVT ExtendVT = TargetVT; if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND) ExtendVT = MVT::i32; CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0); CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1); SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1, CMovN.getOperand(2), CMovN.getOperand(3)); // Finish extending if needed. if (ExtendVT != TargetVT) Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res); return Res; } // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). // This is more or less the reverse of combineBitcastvxi1. static SDValue combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && Opcode != ISD::ANY_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); EVT InSVT = N0.getValueType().getScalarType(); unsigned EltSizeInBits = SVT.getSizeInBits(); // Input type must be extending a bool vector (bit-casted from a scalar // integer) to legal integer types. if (!VT.isVector()) return SDValue(); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) return SDValue(); if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) return SDValue(); SDValue N00 = N0.getOperand(0); EVT SclVT = N0.getOperand(0).getValueType(); if (!SclVT.isScalarInteger()) return SDValue(); SDLoc DL(N); SDValue Vec; SmallVector ShuffleMask; unsigned NumElts = VT.getVectorNumElements(); assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); // Broadcast the scalar integer to the vector elements. if (NumElts > EltSizeInBits) { // If the scalar integer is greater than the vector element size, then we // must split it down into sub-sections for broadcasting. For example: // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); unsigned Scale = NumElts / EltSizeInBits; EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); Vec = DAG.getBitcast(VT, Vec); for (unsigned i = 0; i != Scale; ++i) ShuffleMask.append(EltSizeInBits, i); } else { // For smaller scalar integers, we can simply any-extend it to the vector // element size (we don't care about the upper bits) and broadcast it to all // elements. SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); ShuffleMask.append(NumElts, 0); } Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); // Now, mask the relevant bit in each element. SmallVector Bits; for (unsigned i = 0; i != NumElts; ++i) { int BitIdx = (i % EltSizeInBits); APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); Bits.push_back(DAG.getConstant(Bit, DL, SVT)); } SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); // Compare against the bitmask and extend the result. EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); Vec = DAG.getSExtOrTrunc(Vec, DL, VT); // For SEXT, this is now done, otherwise shift the result down for // zero-extension. if (Opcode == ISD::SIGN_EXTEND) return Vec; return DAG.getNode(ISD::SRL, DL, VT, Vec, DAG.getConstant(EltSizeInBits - 1, DL, VT)); } /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating /// with UNDEFs) of the input to vectors of the same size as the target type /// which then extends the lowest elements. static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (ExperimentalVectorWideningLegalization) return SDValue(); unsigned Opcode = N->getOpcode(); // TODO - add ANY_EXTEND support. if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); EVT InVT = N0.getValueType(); EVT InSVT = InVT.getScalarType(); // FIXME: Generic DAGCombiner previously had a bug that would cause a // sign_extend of setcc to sometimes return the original node and tricked it // into thinking CombineTo was used which prevented the target combines from // running. // Earlying out here to avoid regressions like this // (v4i32 (sext (v4i1 (setcc (v4i16))))) // Becomes // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef)))) // Type legalized to // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32))))))) // Leading to a packssdw+pmovsxwd // We could write a DAG combine to fix this, but really we shouldn't be // creating sext_invec that's forcing v8i16 into the DAG. if (N0.getOpcode() == ISD::SETCC) return SDValue(); // Input type must be a vector and we must be extending legal integer types. if (!VT.isVector() || VT.getVectorNumElements() < 2) return SDValue(); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); // If the input/output types are both legal then we have at least AVX1 and // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly. if (DAG.getTargetLoweringInfo().isTypeLegal(VT) && DAG.getTargetLoweringInfo().isTypeLegal(InVT)) return SDValue(); SDLoc DL(N); auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { EVT SrcVT = N.getValueType(); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), Size / SrcVT.getScalarSizeInBits()); SmallVector Opnds(Size / SrcVT.getSizeInBits(), DAG.getUNDEF(SrcVT)); Opnds[0] = N; return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); }; // If target-size is less than 128-bits, extend to a type that would extend // to 128 bits, extend that and extract the original target vector. if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { unsigned Scale = 128 / VT.getSizeInBits(); EVT ExVT = EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, DAG.getIntPtrConstant(0, DL)); } // If target-size is 128-bits (or 256-bits on AVX target), then convert to // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. // Also use this if we don't have SSE41 to allow the legalizer do its job. if (!Subtarget.hasSSE41() || VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasAVX()) || (VT.is512BitVector() && Subtarget.useAVX512Regs())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); return DAG.getNode(Opcode, DL, VT, ExOp); } auto SplitAndExtendInReg = [&](unsigned SplitSize) { unsigned NumVecs = VT.getSizeInBits() / SplitSize; unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); SmallVector Opnds; for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, DAG.getIntPtrConstant(Offset, DL)); SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec); Opnds.push_back(SrcVec); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); }; // On pre-AVX targets, split into 128-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128)) return SplitAndExtendInReg(128); // On pre-AVX512 targets, split into 256-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) return SplitAndExtendInReg(256); return SDValue(); } // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); SDLoc dl(N); // Only do this combine with AVX512 for vector extends. if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC) return SDValue(); // Only combine legal element types. EVT SVT = VT.getVectorElementType(); if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 && SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64) return SDValue(); // We can only do this if the vector size in 256 bits or less. unsigned Size = VT.getSizeInBits(); if (Size > 256) return SDValue(); // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since // that's the only integer compares with we have. ISD::CondCode CC = cast(N0.getOperand(2))->get(); if (ISD::isUnsignedIntSetCC(CC)) return SDValue(); // Only do this combine if the extension will be fully consumed by the setcc. EVT N00VT = N0.getOperand(0).getValueType(); EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); if (Size != MatchingVecType.getSizeInBits()) return SDValue(); SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC); if (N->getOpcode() == ISD::ZERO_EXTEND) Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType()); return Res; } static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = N0.getValueType(); SDLoc DL(N); if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { // Invert and sign-extend a boolean is the same as zero-extend and subtract // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1. // sext (xor Bool, -1) --> sub (zext Bool), 1 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); } if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; if (VT.isVector()) if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; return SDValue(); } static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) { if (NegMul) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FNMADD; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; case X86ISD::FNMADD: Opcode = ISD::FMA; break; case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; } } if (NegAcc) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FMSUB; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; case X86ISD::FMSUB: Opcode = ISD::FMA; break; case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; } } return Opcode; } static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); auto invertIfNegative = [&DAG](SDValue &V) { if (SDValue NegVal = isFNEG(DAG, V.getNode())) { V = DAG.getBitcast(V.getValueType(), NegVal); return true; } // Look through extract_vector_elts. If it comes from an FNEG, create a // new extract from the FNEG input. if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isNullConstant(V.getOperand(1))) { if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) { NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal); V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), NegVal, V.getOperand(1)); return true; } } return false; }; // Do not convert the passthru input of scalar intrinsics. // FIXME: We could allow negations of the lower element only. bool NegA = invertIfNegative(A); bool NegB = invertIfNegative(B); bool NegC = invertIfNegative(C); if (!NegA && !NegB && !NegC) return SDValue(); unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, A, B, C); } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode()); if (!NegVal) return SDValue(); // FIXME: Should we bitcast instead? if (NegVal.getValueType() != VT) return SDValue(); unsigned NewOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break; case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break; case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break; case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break; } if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), NegVal, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), NegVal); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. This transformation is necessary because // ISD::SETCC is always legalized to i8. SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, dl, VT)); } } if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, dl, VT)); } } if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; if (DCI.isBeforeLegalizeOps()) if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; if (VT.isVector()) if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) return R; // TODO: Combine with any target/faux shuffle. if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 && VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); unsigned NumSrcElts = N00.getValueType().getVectorNumElements(); unsigned NumSrcEltBits = N00.getScalarValueSizeInBits(); APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2); if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) && (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) { return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128); } } return SDValue(); } /// Try to map a 128-bit or larger integer comparison to vector instructions /// before type legalization splits it up into chunks. static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); // We're looking for an oversized integer equality comparison. SDValue X = SetCC->getOperand(0); SDValue Y = SetCC->getOperand(1); EVT OpVT = X.getValueType(); unsigned OpSize = OpVT.getSizeInBits(); if (!OpVT.isScalarInteger() || OpSize < 128) return SDValue(); // Ignore a comparison with zero because that gets special treatment in // EmitTest(). But make an exception for the special case of a pair of // logically-combined vector-sized operands compared to zero. This pattern may // be generated by the memcmp expansion pass with oversized integer compares // (see PR33325). bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR && X.getOperand(0).getOpcode() == ISD::XOR && X.getOperand(1).getOpcode() == ISD::XOR; if (isNullConstant(Y) && !IsOrXorXorCCZero) return SDValue(); // Don't perform this combine if constructing the vector will be expensive. auto IsVectorBitCastCheap = [](SDValue X) { X = peekThroughBitcasts(X); return isa(X) || X.getValueType().isVector() || X.getOpcode() == ISD::LOAD; }; if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) && !IsOrXorXorCCZero) return SDValue(); // TODO: Use PXOR + PTEST for SSE4.1 or later? EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX2()) || (OpSize == 512 && Subtarget.useAVX512Regs())) { EVT VecVT = OpSize == 512 ? MVT::v16i32 : OpSize == 256 ? MVT::v32i8 : MVT::v16i8; EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne // Use 2 vector equality compares and 'and' the results before doing a // MOVMSK. SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0)); SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); } else { SDValue VecX = DAG.getBitcast(VecVT, X); SDValue VecY = DAG.getBitcast(VecVT, Y); Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } // For 512-bits we want to emit a setcc that will lower to kortest. if (OpSize == 512) return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), DAG.getConstant(0xFFFF, DL, MVT::i16), CC); // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL, MVT::i32); return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); } return SDValue(); } static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast(N->getOperand(2))->get(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); EVT OpVT = LHS.getValueType(); SDLoc DL(N); if (CC == ISD::SETNE || CC == ISD::SETEQ) { // 0-x == y --> x+y == 0 // 0-x != y --> x+y != 0 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } // x == 0-y --> x+y == 0 // x != 0-y --> x+y != 0 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) return V; } if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { // Put build_vectors on the right. if (LHS.getOpcode() == ISD::BUILD_VECTOR) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); } bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1); bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); if (IsSEXT0 && IsVZero1) { assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); if (CC == ISD::SETGT) return DAG.getConstant(0, DL, VT); if (CC == ISD::SETLE) return DAG.getConstant(1, DL, VT); if (CC == ISD::SETEQ || CC == ISD::SETGE) return DAG.getNOT(DL, LHS.getOperand(0), VT); assert((CC == ISD::SETNE || CC == ISD::SETLT) && "Unexpected condition code!"); return LHS.getOperand(0); } } // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just // pre-promote its result type since vXi1 vectors don't get promoted // during type legalization. // NOTE: The element count check is to ignore operand types that need to // go through type promotion to a 128-bit vector. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && (ExperimentalVectorWideningLegalization || VT.getVectorNumElements() > 4) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, N->getOperand(2)); return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc); } // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early // to avoid scalarization via legalization because v4i32 is not a legal type. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 && LHS.getValueType() == MVT::v4f32) return LowerVSETCC(SDValue(N, 0), Subtarget, DAG); return SDValue(); } static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); unsigned NumBits = VT.getScalarSizeInBits(); unsigned NumElts = SrcVT.getVectorNumElements(); // Perform constant folding. if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) { assert(VT == MVT::i32 && "Unexpected result type"); APInt Imm(32, 0); for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) { if (!Src.getOperand(Idx).isUndef() && Src.getConstantOperandAPInt(Idx).isNegative()) Imm.setBit(Idx); } return DAG.getConstant(Imm, SDLoc(N), VT); } // Look through int->fp bitcasts that don't change the element width. unsigned EltWidth = SrcVT.getScalarSizeInBits(); if (Src.getOpcode() == ISD::BITCAST && Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results // with scalar comparisons. if (SDValue NotSrc = IsNOT(Src, DAG)) { SDLoc DL(N); APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts); NotSrc = DAG.getBitcast(SrcVT, NotSrc); return DAG.getNode(ISD::XOR, DL, VT, DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc), DAG.getConstant(NotMask, DL, VT)); } // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getAllOnesValue(NumBits)); if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); if (DCI.isBeforeLegalizeOps()) { SDValue Index = N->getOperand(4); // Remove any sign extends from 32 or smaller to larger than 32. // Only do this before LegalizeOps in case we need the sign extend for // legalization. if (Index.getOpcode() == ISD::SIGN_EXTEND) { if (Index.getScalarValueSizeInBits() > 32 && Index.getOperand(0).getScalarValueSizeInBits() <= 32) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index.getOperand(0); SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); if (Res == N) { // The original sign extend has less users, add back to worklist in // case it needs to be removed DCI.AddToWorklist(Index.getNode()); DCI.AddToWorklist(N); } return SDValue(Res, 0); } } // Make sure the index is either i32 or i64 unsigned ScalarSize = Index.getScalarValueSizeInBits(); if (ScalarSize != 32 && ScalarSize != 64) { MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32; EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index; SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); if (Res == N) DCI.AddToWorklist(N); return SDValue(Res, 0); } // Try to remove zero extends from 32->64 if we know the sign bit of // the input is zero. if (Index.getOpcode() == ISD::ZERO_EXTEND && Index.getScalarValueSizeInBits() == 64 && Index.getOperand(0).getScalarValueSizeInBits() == 32) { if (DAG.SignBitIsZero(Index.getOperand(0))) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index.getOperand(0); SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); if (Res == N) { // The original sign extend has less users, add back to worklist in // case it needs to be removed DCI.AddToWorklist(Index.getNode()); DCI.AddToWorklist(N); } return SDValue(Res, 0); } } } // With AVX2 we only demand the upper bit of the mask. if (!Subtarget.hasAVX512()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Mask = N->getOperand(2); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); } return SDValue(); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); // Try to simplify the EFLAGS and condition code operands. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) return getSETCC(CC, Flags, DL, DAG); return SDValue(); } /// Optimize branch condition evaluation. static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); // Try to simplify the EFLAGS and condition code operands. // Make sure to not keep references to operands, as combineSetCCEFLAGS can // RAUW them under us. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), N->getOperand(1), Cond, Flags); } return SDValue(); } static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (auto *BV = dyn_cast(N->getOperand(0).getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getBitcast(VT, NewAnd); return Res; } return SDValue(); } /// If we are converting a value to floating-point, try to replace scalar /// truncate of an extracted vector element with a bitcast. This tries to keep /// the sequence on XMM registers rather than moving between vector and GPRs. static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) { // TODO: This is currently only used by combineSIntToFP, but it is generalized // to allow being called by any similar cast opcode. // TODO: Consider merging this into lowering: vectorizeExtractedCast(). SDValue Trunc = N->getOperand(0); if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE) return SDValue(); SDValue ExtElt = Trunc.getOperand(0); if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isNullConstant(ExtElt.getOperand(1))) return SDValue(); EVT TruncVT = Trunc.getValueType(); EVT SrcVT = ExtElt.getValueType(); unsigned DestWidth = TruncVT.getSizeInBits(); unsigned SrcWidth = SrcVT.getSizeInBits(); if (SrcWidth % DestWidth != 0) return SDValue(); // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0) EVT SrcVecVT = ExtElt.getOperand(0).getValueType(); unsigned VecWidth = SrcVecVT.getSizeInBits(); unsigned NumElts = VecWidth / DestWidth; EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts); SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0)); SDLoc DL(N); SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT, BitcastVec, ExtElt.getOperand(1)); return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt); } static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32)) // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP. return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. if (DAG.SignBitIsZero(Op0)) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0); return SDValue(); } static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG)) return Res; // Now move on to more general possibilities. SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Without AVX512DQ we only support i64 to float scalar conversion. For both // vectors and scalars, see if we know that the upper bits are all the sign // bit, in which case we can truncate the input to i32 and convert from that. if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) { unsigned BitWidth = InVT.getScalarSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); if (NumSignBits >= (BitWidth - 31)) { EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32); if (InVT.isVector()) TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, InVT.getVectorNumElements()); SDLoc dl(N); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); } } // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (!Subtarget.useSoftFloat() && Subtarget.hasX87() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast(Op0.getNode()); EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 or f128. if (VT == MVT::f16 || VT == MVT::f128) return SDValue(); // If we have AVX512DQ we can use packed conversion instructions unless // the VT is f80. if (Subtarget.hasDQI() && VT != MVT::f80) return SDValue(); if (!Ld->isVolatile() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget.is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } } if (SDValue V = combineToFPTruncExtElt(N, DAG)) return V; return SDValue(); } static bool needCarryOrOverflowFlag(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { SDNode *User = *UI; X86::CondCode CC; switch (User->getOpcode()) { default: // Be conservative. return true; case X86ISD::SETCC: case X86ISD::SETCC_CARRY: CC = (X86::CondCode)User->getConstantOperandVal(0); break; case X86ISD::BRCOND: CC = (X86::CondCode)User->getConstantOperandVal(2); break; case X86ISD::CMOV: CC = (X86::CondCode)User->getConstantOperandVal(2); break; } switch (CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: case X86::COND_O: case X86::COND_NO: case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return true; } } return false; } static bool onlyZeroFlagUsed(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned CCOpNo; switch (User->getOpcode()) { default: // Be conservative. return false; case X86ISD::SETCC: CCOpNo = 0; break; case X86ISD::SETCC_CARRY: CCOpNo = 0; break; case X86ISD::BRCOND: CCOpNo = 2; break; case X86ISD::CMOV: CCOpNo = 2; break; } X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo); if (CC != X86::COND_E && CC != X86::COND_NE) return false; } return true; } static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { // Only handle test patterns. if (!isNullConstant(N->getOperand(1))) return SDValue(); // If we have a CMP of a truncated binop, see if we can make a smaller binop // and use its flags directly. // TODO: Maybe we should try promoting compares that only use the zero flag // first if we can prove the upper bits with computeKnownBits? SDLoc dl(N); SDValue Op = N->getOperand(0); EVT VT = Op.getValueType(); // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) && Op.hasOneUse() && isa(Op.getOperand(1)) && onlyZeroFlagUsed(SDValue(N, 0))) { unsigned BitWidth = VT.getSizeInBits(); const APInt &ShAmt = Op.getConstantOperandAPInt(1); if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts. unsigned MaskBits = BitWidth - ShAmt.getZExtValue(); APInt Mask = Op.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, MaskBits) : APInt::getLowBitsSet(BitWidth, MaskBits); if (Mask.isSignedIntN(32)) { Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), DAG.getConstant(Mask, dl, VT)); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, VT)); } } } // Look for a truncate with a single use. if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse()) return SDValue(); Op = Op.getOperand(0); // Arithmetic op can only have one use. if (!Op.hasOneUse()) return SDValue(); unsigned NewOpc; switch (Op.getOpcode()) { default: return SDValue(); case ISD::AND: // Skip and with constant. We have special handling for and with immediate // during isel to generate test instructions. if (isa(Op.getOperand(1))) return SDValue(); NewOpc = X86ISD::AND; break; case ISD::OR: NewOpc = X86ISD::OR; break; case ISD::XOR: NewOpc = X86ISD::XOR; break; case ISD::ADD: // If the carry or overflow flag is used, we can't truncate. if (needCarryOrOverflowFlag(SDValue(N, 0))) return SDValue(); NewOpc = X86ISD::ADD; break; case ISD::SUB: // If the carry or overflow flag is used, we can't truncate. if (needCarryOrOverflowFlag(SDValue(N, 0))) return SDValue(); NewOpc = X86ISD::SUB; break; } // We found an op we can narrow. Truncate its inputs. SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0)); SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1)); // Use a X86 specific opcode to avoid DAG combine messing with it. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1); // For AND, keep a CMP so that we can match the test pattern. if (NewOpc == X86ISD::AND) return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, VT)); // Return the flags. return Op.getValue(1); } static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) && "Expected X86ISD::ADD or X86ISD::SUB"); SDLoc DL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); MVT VT = LHS.getSimpleValueType(); unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB; // If we don't use the flag result, simplify back to a generic ADD/SUB. if (!N->hasAnyUseOfValue(1)) { SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS); return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL); } // Fold any similar generic ADD/SUB opcodes to reuse this node. auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) { SDValue Ops[] = {N0, N1}; SDVTList VTs = DAG.getVTList(N->getValueType(0)); if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) { SDValue Op(N, 0); if (Negate) Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); DCI.CombineTo(GenericAddSub, Op); } }; MatchGeneric(LHS, RHS, false); MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); return SDValue(); } static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, N->getOperand(0), N->getOperand(1), Flags); } // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry) // iff the flag result is dead. SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) && !N->hasAnyUseOfValue(1)) return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0), Op0.getOperand(1), N->getOperand(2)); return SDValue(); } // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. if (X86::isZeroNode(N->getOperand(0)) && X86::isZeroNode(N->getOperand(1)) && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), N->getOperand(2)), DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, N->getOperand(0), N->getOperand(1), Flags); } return SDValue(); } /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { bool IsSub = N->getOpcode() == ISD::SUB; SDValue X = N->getOperand(0); SDValue Y = N->getOperand(1); // If this is an add, canonicalize a zext operand to the RHS. // TODO: Incomplete? What if both sides are zexts? if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && Y.getOpcode() != ISD::ZERO_EXTEND) std::swap(X, Y); // Look through a one-use zext. bool PeekedThroughZext = false; if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { Y = Y.getOperand(0); PeekedThroughZext = true; } // If this is an add, canonicalize a setcc operand to the RHS. // TODO: Incomplete? What if both sides are setcc? // TODO: Should we allow peeking through a zext of the other operand? if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && Y.getOpcode() != X86ISD::SETCC) std::swap(X, Y); if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. auto *ConstantX = dyn_cast(X); if (ConstantX) { if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) || (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) { // This is a complicated way to get -1 or 0 from the carry flag: // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), Y.getOperand(1)); } if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) || (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) { SDValue EFLAGS = Y->getOperand(1); if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { // Swap the operands of a SUB, and we have the same pattern as above. // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB SDValue NewSub = DAG.getNode( X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), NewEFLAGS); } } } if (CC == X86::COND_B) { // X + SETB Z --> adc X, 0 // X - SETB Z --> sbb X, 0 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(0, DL, VT), Y.getOperand(1)); } if (CC == X86::COND_A) { SDValue EFLAGS = Y->getOperand(1); // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(0, DL, VT), NewEFLAGS); } } if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); SDValue Cmp = Y.getOperand(1); if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || !X86::isZeroNode(Cmp.getOperand(1)) || !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); SDValue Z = Cmp.getOperand(0); EVT ZVT = Z.getValueType(); // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. if (ConstantX) { // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with // fake operands: // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) { SDValue Zero = DAG.getConstant(0, DL, ZVT); SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); } // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' // with fake operands: // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); } } // (cmp Z, 1) sets the carry flag if Z is 0. SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); // Add the flags type for ADC/SBB nodes. SDVTList VTs = DAG.getVTList(VT, MVT::i32); // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, DAG.getConstant(-1ULL, DL, VT), Cmp1); // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, DAG.getConstant(0, DL, VT), Cmp1); } static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); // If the vector size is less than 128, or greater than the supported RegSize, // do not use PMADD. if (!VT.isVector() || VT.getVectorNumElements() < 8) return SDValue(); if (Op0.getOpcode() != ISD::MUL) std::swap(Op0, Op1); if (Op0.getOpcode() != ISD::MUL) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16) return SDValue(); SDLoc DL(N); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, VT.getVectorNumElements() / 2); // Madd vector size is half of the original vector size auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; auto BuildPMADDWD = [&](SDValue Mul) { // Shrink the operands of mul. SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0)); SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1)); SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, PMADDWDBuilder); // Fill the rest of the output with 0 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, DAG.getConstant(0, DL, MAddVT)); }; Op0 = BuildPMADDWD(Op0); // It's possible that Op1 is also a mul we can reduce. if (Op1.getOpcode() == ISD::MUL && canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) { Op1 = BuildPMADDWD(Op1); } return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. if (!VT.isVector() || !VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) return SDValue(); unsigned RegSize = 128; if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX()) RegSize = 256; // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. if (VT.getSizeInBits() / 4 > RegSize) return SDValue(); // We know N is a reduction add, which means one of its operands is a phi. // To match SAD, we need the other operand to be a ABS. if (Op0.getOpcode() != ISD::ABS) std::swap(Op0, Op1); if (Op0.getOpcode() != ISD::ABS) return SDValue(); auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) { // SAD pattern detected. Now build a SAD instruction and an addition for // reduction. Note that the number of elements of the result of SAD is less // than the number of elements of its input. Therefore, we could only update // part of elements in the reduction vector. SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget); // The output of PSADBW is a vector of i64. // We need to turn the vector of i64 into a vector of i32. // If the reduction vector is at least as wide as the psadbw result, just // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero // anyway. MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); if (VT.getSizeInBits() >= ResVT.getSizeInBits()) Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); else Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); if (VT.getSizeInBits() > ResVT.getSizeInBits()) { // Fill the upper elements with zero to match the add width. SDValue Zero = DAG.getConstant(0, DL, VT); Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, DAG.getIntPtrConstant(0, DL)); } return Sad; }; // Check whether we have an abs-diff pattern feeding into the select. SDValue SadOp0, SadOp1; if (!detectZextAbsDiff(Op0, SadOp0, SadOp1)) return SDValue(); Op0 = BuildPSADBW(SadOp0, SadOp1); // It's possible we have a sad on the other side too. if (Op1.getOpcode() == ISD::ABS && detectZextAbsDiff(Op1, SadOp0, SadOp1)) { Op1 = BuildPSADBW(SadOp0, SadOp1); } return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); } /// Convert vector increment or decrement to sub/add with an all-ones constant: /// add X, <1, 1...> --> sub X, <-1, -1...> /// sub X, <1, 1...> --> add X, <-1, -1...> /// The all-ones vector constant can be materialized using a pcmpeq instruction /// that is commonly recognized as an idiom (has no register dependency), so /// that's better/smaller than loading a splat 1 constant. static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && "Unexpected opcode for increment/decrement transform"); // Pseudo-legality check: getOnesVector() expects one of these types, so bail // out and wait for legalization if we have an unsupported vector length. EVT VT = N->getValueType(0); if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); APInt SplatVal; if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue()) return SDValue(); SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { // Example of pattern we try to detect: // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1)))) //(add (build_vector (extract_elt t, 0), // (extract_elt t, 2), // (extract_elt t, 4), // (extract_elt t, 6)), // (build_vector (extract_elt t, 1), // (extract_elt t, 3), // (extract_elt t, 5), // (extract_elt t, 7))) if (!Subtarget.hasSSE2()) return SDValue(); if (Op0.getOpcode() != ISD::BUILD_VECTOR || Op1.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 || VT.getVectorNumElements() < 4 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Check if one of Op0,Op1 is of the form: // (build_vector (extract_elt Mul, 0), // (extract_elt Mul, 2), // (extract_elt Mul, 4), // ... // the other is of the form: // (build_vector (extract_elt Mul, 1), // (extract_elt Mul, 3), // (extract_elt Mul, 5), // ... // and identify Mul. SDValue Mul; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) { SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i), Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1); // TODO: Be more tolerant to undefs. if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); auto *Const0L = dyn_cast(Op0L->getOperand(1)); auto *Const1L = dyn_cast(Op1L->getOperand(1)); auto *Const0H = dyn_cast(Op0H->getOperand(1)); auto *Const1H = dyn_cast(Op1H->getOperand(1)); if (!Const0L || !Const1L || !Const0H || !Const1H) return SDValue(); unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(), Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue(); // Commutativity of mul allows factors of a product to reorder. if (Idx0L > Idx1L) std::swap(Idx0L, Idx1L); if (Idx0H > Idx1H) std::swap(Idx0H, Idx1H); // Commutativity of add allows pairs of factors to reorder. if (Idx0L > Idx0H) { std::swap(Idx0L, Idx0H); std::swap(Idx1L, Idx1H); } if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 || Idx1H != 2 * i + 3) return SDValue(); if (!Mul) { // First time an extract_elt's source vector is visited. Must be a MUL // with 2X number of vector elements than the BUILD_VECTOR. // Both extracts must be from same MUL. Mul = Op0L->getOperand(0); if (Mul->getOpcode() != ISD::MUL || Mul.getValueType().getVectorNumElements() != 2 * e) return SDValue(); } // Check that the extract is from the same MUL previously seen. if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) || Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0)) return SDValue(); } // Check if the Mul source can be safely shrunk. ShrinkMode Mode; if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16) return SDValue(); auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. EVT InVT = Ops[0].getValueType(); assert(InVT.getScalarType() == MVT::i32 && "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements() / 2); EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, InVT.getVectorNumElements()); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]), DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1])); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { Mul.getOperand(0), Mul.getOperand(1) }, PMADDBuilder); } // Attempt to turn this pattern into PMADDWD. // (mul (add (zext (build_vector)), (zext (build_vector))), // (add (zext (build_vector)), (zext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) return SDValue(); if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 || VT.getVectorNumElements() < 4 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); // All inputs need to be sign extends. // TODO: Support ZERO_EXTEND from known positive? if (N00.getOpcode() != ISD::SIGN_EXTEND || N01.getOpcode() != ISD::SIGN_EXTEND || N10.getOpcode() != ISD::SIGN_EXTEND || N11.getOpcode() != ISD::SIGN_EXTEND) return SDValue(); // Peek through the extends. N00 = N00.getOperand(0); N01 = N01.getOperand(0); N10 = N10.getOperand(0); N11 = N11.getOperand(0); // Must be extending from vXi16. EVT InVT = N00.getValueType(); if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT || N10.getValueType() != InVT || N11.getValueType() != InVT) return SDValue(); // All inputs should be build_vectors. if (N00.getOpcode() != ISD::BUILD_VECTOR || N01.getOpcode() != ISD::BUILD_VECTOR || N10.getOpcode() != ISD::BUILD_VECTOR || N11.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // For each element, we need to ensure we have an odd element from one vector // multiplied by the odd element of another vector and the even element from // one of the same vectors being multiplied by the even element from the // other vector. So we need to make sure for each element i, this operator // is being performed: // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] SDValue In0, In1; for (unsigned i = 0; i != N00.getNumOperands(); ++i) { SDValue N00Elt = N00.getOperand(i); SDValue N01Elt = N01.getOperand(i); SDValue N10Elt = N10.getOperand(i); SDValue N11Elt = N11.getOperand(i); // TODO: Be more tolerant to undefs. if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); auto *ConstN00Elt = dyn_cast(N00Elt.getOperand(1)); auto *ConstN01Elt = dyn_cast(N01Elt.getOperand(1)); auto *ConstN10Elt = dyn_cast(N10Elt.getOperand(1)); auto *ConstN11Elt = dyn_cast(N11Elt.getOperand(1)); if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) return SDValue(); unsigned IdxN00 = ConstN00Elt->getZExtValue(); unsigned IdxN01 = ConstN01Elt->getZExtValue(); unsigned IdxN10 = ConstN10Elt->getZExtValue(); unsigned IdxN11 = ConstN11Elt->getZExtValue(); // Add is commutative so indices can be reordered. if (IdxN00 > IdxN10) { std::swap(IdxN00, IdxN10); std::swap(IdxN01, IdxN11); } // N0 indices be the even element. N1 indices must be the next odd element. if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i || IdxN11 != 2 * i + 1) return SDValue(); SDValue N00In = N00Elt.getOperand(0); SDValue N01In = N01Elt.getOperand(0); SDValue N10In = N10Elt.getOperand(0); SDValue N11In = N11Elt.getOperand(0); // First time we find an input capture it. if (!In0) { In0 = N00In; In1 = N01In; } // Mul is commutative so the input vectors can be in any order. // Canonicalize to make the compares easier. if (In0 != N00In) std::swap(N00In, N01In); if (In0 != N10In) std::swap(N10In, N11In); if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In) return SDValue(); } auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. EVT OpVT = Ops[0].getValueType(); assert(OpVT.getScalarType() == MVT::i16 && "Unexpected scalar element type"); assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, OpVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 }, PMADDBuilder); } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasVectorReduction()) { if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) return Sad; if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) return MAdd; } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget)) return MAdd; if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget)) return MAdd; // Try to synthesize horizontal adds from adds of shuffles. if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) { auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HADDBuilder); } if (SDValue V = combineIncDecVector(N, DAG)) return V; return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); // PSUBUS is supported, starting from SSE2, but truncation for v8i32 // is only worth it with SSSE3 (PSHUFB). if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) && !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 || VT == MVT::v8i64))) return SDValue(); SDValue SubusLHS, SubusRHS; // Try to find umax(a,b) - b or a - umin(a,b) patterns // they may be converted to subus(a,b). // TODO: Need to add IR canonicalization for this code. if (Op0.getOpcode() == ISD::UMAX) { SubusRHS = Op1; SDValue MaxLHS = Op0.getOperand(0); SDValue MaxRHS = Op0.getOperand(1); if (MaxLHS == Op1) SubusLHS = MaxRHS; else if (MaxRHS == Op1) SubusLHS = MaxLHS; else return SDValue(); } else if (Op1.getOpcode() == ISD::UMIN) { SubusLHS = Op0; SDValue MinLHS = Op1.getOperand(0); SDValue MinRHS = Op1.getOperand(1); if (MinLHS == Op0) SubusRHS = MinRHS; else if (MinRHS == Op0) SubusRHS = MinLHS; else return SDValue(); } else return SDValue(); auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops); }; // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with // special preprocessing in some cases. if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64) return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { SubusLHS, SubusRHS }, USUBSATBuilder); // Special preprocessing case can be only applied // if the value was zero extended from 16 bit, // so we require first 16 bits to be zeros for 32 bit // values, or first 48 bits for 64 bit values. KnownBits Known = DAG.computeKnownBits(SubusLHS); unsigned NumZeros = Known.countMinLeadingZeros(); if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16) return SDValue(); EVT ExtType = SubusLHS.getValueType(); EVT ShrinkedType; if (VT == MVT::v8i32 || VT == MVT::v8i64) ShrinkedType = MVT::v8i16; else ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16; // If SubusLHS is zeroextended - truncate SubusRHS to it's // size SubusRHS = umin(0xFFF.., SubusRHS). SDValue SaturationConst = DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(), ShrinkedType.getScalarSizeInBits()), SDLoc(SubusLHS), ExtType); SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS, SaturationConst); SDValue NewSubusLHS = DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType); SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType); SDValue Psubus = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType, { NewSubusLHS, NewSubusRHS }, USUBSATBuilder); // Zero extend the result, it may be used somewhere as 32 bit, // if not zext and following trunc will shrink. return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // X86 can't encode an immediate LHS of a sub. See if we can push the // negation into a preceding instruction. if (ConstantSDNode *C = dyn_cast(Op0)) { // If the RHS of the sub is a XOR with one use and a constant, invert the // immediate. Then add one to the LHS of the sub so we can turn // X-Y -> X+~Y+1, saving one register. if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && isa(Op1.getOperand(1))) { const APInt &XorC = Op1.getConstantOperandAPInt(1); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getConstant(~XorC, SDLoc(Op1), VT)); return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT)); } } // Try to synthesize horizontal subs from subs of shuffles. EVT VT = N->getValueType(0); if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) { auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HSUBBuilder); } if (SDValue V = combineIncDecVector(N, DAG)) return V; // Try to create PSUBUS if SUB's argument is max/min if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) return V; return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); SDLoc DL(N); if (N->getOperand(0) == N->getOperand(1)) { if (N->getOpcode() == X86ISD::PCMPEQ) return DAG.getConstant(-1, DL, VT); if (N->getOpcode() == X86ISD::PCMPGT) return DAG.getConstant(0, DL, VT); } return SDValue(); } /// Helper that combines an array of subvector ops as if they were the operands /// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g. /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type. static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors"); if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); })) return DAG.getUNDEF(VT); if (llvm::all_of(Ops, [](SDValue Op) { return ISD::isBuildVectorAllZeros(Op.getNode()); })) return getZeroVector(VT, Subtarget, DAG, DL); SDValue Op0 = Ops[0]; // Fold subvector loads into one. // If needed, look through bitcasts to get to the load. if (auto *FirstLd = dyn_cast(peekThroughBitcasts(Op0))) { bool Fast; const X86TargetLowering *TLI = Subtarget.getTargetLowering(); if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, *FirstLd->getMemOperand(), &Fast) && Fast) { if (SDValue Ld = EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) return Ld; } } // Repeated subvectors. if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) { // If this broadcast/subv_broadcast is inserted into both halves, use a // larger broadcast/subv_broadcast. if (Op0.getOpcode() == X86ISD::VBROADCAST || Op0.getOpcode() == X86ISD::SUBV_BROADCAST) return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0)); // concat_vectors(movddup(x),movddup(x)) -> broadcast(x) if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 && (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0)))) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64, Op0.getOperand(0), DAG.getIntPtrConstant(0, DL))); // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x) if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR && (Subtarget.hasAVX2() || (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) && Op0.getOperand(0).getValueType() == VT.getScalarType()) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0)); } bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; }); // Repeated opcode. // TODO - combineX86ShufflesRecursively should handle shuffle concatenation // but it currently struggles with different vector widths. if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op.getOpcode() == Op0.getOpcode(); })) { unsigned NumOps = Ops.size(); switch (Op0.getOpcode()) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::PSHUFD: if (!IsSplat && NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) { SmallVector Src; for (unsigned i = 0; i != NumOps; ++i) Src.push_back(Ops[i].getOperand(0)); return DAG.getNode(Op0.getOpcode(), DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src), Op0.getOperand(1)); } LLVM_FALLTHROUGH; case X86ISD::VPERMILPI: // TODO - add support for vXf64/vXi64 shuffles. if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) && Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) { SmallVector Src; for (unsigned i = 0; i != NumOps; ++i) Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0))); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src); Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res, Op0.getOperand(1)); return DAG.getBitcast(VT, Res); } break; case X86ISD::PACKUS: if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) { SmallVector LHS, RHS; for (unsigned i = 0; i != NumOps; ++i) { LHS.push_back(Ops[i].getOperand(0)); RHS.push_back(Ops[i].getOperand(1)); } MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), NumOps * SrcVT.getVectorNumElements()); return DAG.getNode(Op0.getOpcode(), DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS), DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS)); } break; } } // If we're inserting all zeros into the upper half, change this to // an insert into an all zeros vector. We will match this to a move // with implicit upper bit zeroing during isel. if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode())) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), Ops[0], DAG.getIntPtrConstant(0, DL)); return SDValue(); } static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Don't do anything for i1 vectors. if (VT.getVectorElementType() == MVT::i1) return SDValue(); if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) { SmallVector Ops(N->op_begin(), N->op_end()); if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG, DCI, Subtarget)) return R; } return SDValue(); } static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT OpVT = N->getSimpleValueType(0); bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1; SDLoc dl(N); SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); uint64_t IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); if (Vec.isUndef() && SubVec.isUndef()) return DAG.getUNDEF(OpVT); // Inserting undefs/zeros into zeros/undefs is a zero vector. if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) && (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode()))) return getZeroVector(OpVT, Subtarget, DAG, dl); if (ISD::isBuildVectorAllZeros(Vec.getNode())) { // If we're inserting into a zero vector and then into a larger zero vector, // just insert into the larger zero vector directly. if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { uint64_t Idx2Val = SubVec.getConstantOperandVal(2); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec.getOperand(1), DAG.getIntPtrConstant(IdxVal + Idx2Val, dl)); } // If we're inserting into a zero vector and our input was extracted from an // insert into a zero vector of the same type and the extraction was at // least as large as the original insertion. Just insert the original // subvector into a zero vector. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && SubVec.getConstantOperandAPInt(1) == 0 && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); if (Ins.getConstantOperandAPInt(2) == 0 && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), Ins.getOperand(1), N->getOperand(2)); } } // Stop here if this is an i1 vector. if (IsI1Vector) return SDValue(); // If this is an insert of an extract, combine to a shuffle. Don't do this // if the insert or extract can be represented with a subregister operation. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && SubVec.getOperand(0).getSimpleValueType() == OpVT && (IdxVal != 0 || !Vec.isUndef())) { int ExtIdxVal = SubVec.getConstantOperandVal(1); if (ExtIdxVal != 0) { int VecNumElts = OpVT.getVectorNumElements(); int SubVecNumElts = SubVecVT.getVectorNumElements(); SmallVector Mask(VecNumElts); // First create an identity shuffle mask. for (int i = 0; i != VecNumElts; ++i) Mask[i] = i; // Now insert the extracted portion. for (int i = 0; i != SubVecNumElts; ++i) Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); } } // Match concat_vector style patterns. SmallVector SubVectorOps; if (collectConcatOps(N, SubVectorOps)) if (SDValue Fold = combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) return Fold; // If we are inserting into both halves of the vector, the starting vector // should be undef. If it isn't, make it so. Only do this if the early insert // has no other uses. // TODO: Should this be a generic DAG combine? // TODO: Why doesn't SimplifyDemandedVectorElts catch this? if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 && isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() && Vec.getOperand(1).getValueSizeInBits() == SubVecVT.getSizeInBits() && Vec.hasOneUse()) { Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), Vec.getOperand(1), Vec.getOperand(2)); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, N->getOperand(2)); } // If this is a broadcast insert into an upper undef, use a larger broadcast. if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); return SDValue(); } /// If we are extracting a subvector of a vector select and the select condition /// is composed of concatenated vectors, try to narrow the select width. This /// is a common pattern for AVX1 integer code because 256-bit selects may be /// legal, but there is almost no integer math/logic available for 256-bit. /// This function should only be called with legal types (otherwise, the calls /// to get simple value types will assert). static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { SDValue Sel = peekThroughBitcasts(Ext->getOperand(0)); SmallVector CatOps; if (Sel.getOpcode() != ISD::VSELECT || !collectConcatOps(Sel.getOperand(0).getNode(), CatOps)) return SDValue(); // Note: We assume simple value types because this should only be called with // legal operations/types. // TODO: This can be extended to handle extraction to 256-bits. MVT VT = Ext->getSimpleValueType(0); if (!VT.is128BitVector()) return SDValue(); MVT SelCondVT = Sel.getOperand(0).getSimpleValueType(); if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector()) return SDValue(); MVT WideVT = Ext->getOperand(0).getSimpleValueType(); MVT SelVT = Sel.getSimpleValueType(); assert((SelVT.is256BitVector() || SelVT.is512BitVector()) && "Unexpected vector type with legal operations"); unsigned SelElts = SelVT.getVectorNumElements(); unsigned CastedElts = WideVT.getVectorNumElements(); unsigned ExtIdx = cast(Ext->getOperand(1))->getZExtValue(); if (SelElts % CastedElts == 0) { // The select has the same or more (narrower) elements than the extract // operand. The extraction index gets scaled by that factor. ExtIdx *= (SelElts / CastedElts); } else if (CastedElts % SelElts == 0) { // The select has less (wider) elements than the extract operand. Make sure // that the extraction index can be divided evenly. unsigned IndexDivisor = CastedElts / SelElts; if (ExtIdx % IndexDivisor != 0) return SDValue(); ExtIdx /= IndexDivisor; } else { llvm_unreachable("Element count of simple vector types are not divisible?"); } unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits(); unsigned NarrowElts = SelElts / NarrowingFactor; MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts); SDLoc DL(Ext); SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL); SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL); SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL); SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF); return DAG.getBitcast(VT, NarrowSel); } static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // For AVX1 only, if we are extracting from a 256-bit and+not (which will // eventually get combined/lowered into ANDNP) with a concatenated operand, // split the 'and' into 128-bit ops to avoid the concatenate and extract. // We let generic combining take over from there to simplify the // insert/extract and 'not'. // This pattern emerges during AVX1 legalization. We handle it before lowering // to avoid complications like splitting constant vector loads. // Capture the original wide type in the likely case that we need to bitcast // back to this type. if (!N->getValueType(0).isSimple()) return SDValue(); MVT VT = N->getSimpleValueType(0); EVT WideVecVT = N->getOperand(0).getValueType(); SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(WideVecVT) && WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) { auto isConcatenatedNot = [] (SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) return false; SDValue NotOp = V->getOperand(0); return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS; }; if (isConcatenatedNot(WideVec.getOperand(0)) || isConcatenatedNot(WideVec.getOperand(1))) { // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 SDValue Concat = split256IntArith(WideVec, DAG); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, DAG.getBitcast(WideVecVT, Concat), N->getOperand(1)); } } if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue V = narrowExtractedVectorSelect(N, DAG)) return V; SDValue InVec = N->getOperand(0); unsigned IdxVal = cast(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); if (ISD::isBuildVectorAllOnes(InVec.getNode())) { if (VT.getScalarType() == MVT::i1) return DAG.getConstant(1, SDLoc(N), VT); return getOnesVector(VT, DAG, SDLoc(N)); } if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector( VT, SDLoc(N), InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); // Try to move vector bitcast after extract_subv by scaling extraction index: // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR if (InVec.getOpcode() == ISD::BITCAST && InVec.getOperand(0).getValueType().isVector()) { SDValue SrcOp = InVec.getOperand(0); EVT SrcVT = SrcOp.getValueType(); unsigned SrcNumElts = SrcVT.getVectorNumElements(); unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); if ((DestNumElts % SrcNumElts) == 0) { unsigned DestSrcRatio = DestNumElts / SrcNumElts; if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), NewExtNumElts); if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; SDLoc DL(N); SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, SrcOp, NewIndex); return DAG.getBitcast(VT, NewExtract); } } } } // If we're extracting from a broadcast then we're better off just // broadcasting to the smaller type directly, assuming this is the only use. // As its a broadcast we don't care about the extraction index. if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() && InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0)); } // v2f64 CVTUDQ2PD(v4i32). if (InOpcode == ISD::UINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0)); } // v2f64 CVTPS2PD(v4f32). if (InOpcode == ISD::FP_EXTEND && InVec.getOperand(0).getValueType() == MVT::v4f32) { return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0)); } } if ((InOpcode == ISD::ANY_EXTEND || InOpcode == ISD::ANY_EXTEND_VECTOR_INREG || InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || InOpcode == ISD::SIGN_EXTEND || InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && VT.is128BitVector() && InVec.getOperand(0).getSimpleValueType().is128BitVector()) { unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0)); } if (InOpcode == ISD::VSELECT && InVec.getOperand(0).getValueType().is256BitVector() && InVec.getOperand(1).getValueType().is256BitVector() && InVec.getOperand(2).getValueType().is256BitVector()) { SDLoc DL(N); SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128); SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128); SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } } return SDValue(); } static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and. // This occurs frequently in our masked scalar intrinsic code and our // floating point select lowering with AVX512. // TODO: SimplifyDemandedBits instead? if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse()) if (auto *C = dyn_cast(Src.getOperand(1))) if (C->getAPIntValue().isOneValue()) return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0)); // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec. if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() && Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1) if (auto *C = dyn_cast(Src.getOperand(1))) if (C->isNullValue()) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0), Src.getOperand(1)); // Reduce v2i64 to v4i32 if we don't need the upper bits. // TODO: Move to DAGCombine? if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND && Src.getValueType() == MVT::i64 && Src.hasOneUse() && Src.getOperand(0).getScalarValueSizeInBits() <= 32) return DAG.getBitcast( VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32))); return SDValue(); } // Simplify PMULDQ and PMULUDQ operations. static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // Canonicalize constant to RHS. if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) && !DAG.isConstantIntBuildVectorOrConstantInt(RHS)) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS); // Multiply by zero. + // Don't return RHS as it may contain UNDEFs. if (ISD::isBuildVectorAllZeros(RHS.getNode())) - return RHS; + return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N)); // Aggressively peek through ops to get at the demanded low bits. APInt DemandedMask = APInt::getLowBitsSet(64, 32); SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask); SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask); if (DemandedLHS || DemandedRHS) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), DemandedLHS ? DemandedLHS : LHS, DemandedRHS ? DemandedRHS : RHS); // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Try to merge vector loads and extend_inreg to an extload. if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { auto *Ld = cast(In); if (!Ld->isVolatile()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD; EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements()); if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { SDValue Load = DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT, Ld->getAlignment(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; } } } // Disabling for widening legalization for now. We can enable if we find a // case that needs it. Otherwise it can be deleted when we switch to // widening legalization. if (ExperimentalVectorWideningLegalization) return SDValue(); // Combine (ext_invec (ext_invec X)) -> (ext_invec X) if (In.getOpcode() == N->getOpcode() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); // Attempt to combine as a shuffle. // TODO: SSE41 support if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { SDValue Op(N, 0); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::SCALAR_TO_VECTOR: return combineScalarToVector(N, DAG); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::CONCAT_VECTORS: return combineConcatVectors(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: return combineExtractSubvector(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget); case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case X86ISD::CMP: return combineCMP(N, DAG); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); case X86ISD::ADD: case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: return combineShiftLeft(N, DAG); case ISD::SRA: return combineShiftRightArithmetic(N, DAG); case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI, Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::PACKSS: case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); case X86ISD::VSHL: case X86ISD::VSRA: case X86ISD::VSRL: return combineVectorShiftVar(N, DAG, DCI, Subtarget); case X86ISD::VSHLI: case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShiftImm(N, DAG, DCI, Subtarget); case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::VBROADCAST: case X86ISD::VPPERM: case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIL2: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VZEXT_MOVL: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMSUB: case X86ISD::FMSUB_RND: case X86ISD::FNMADD: case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: case X86ISD::FNMSUB_RND: case ISD::FMA: return combineFMA(N, DAG, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); case X86ISD::MGATHER: case X86ISD::MSCATTER: case ISD::MGATHER: case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: - case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI); + case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); } return SDValue(); } bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; // There are no vXi8 shifts. if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8) return false; // TODO: Almost no 8-bit ops are desirable because they have no actual // size/speed advantages vs. 32-bit ops, but they do have a major // potential disadvantage by causing partial register stalls. // // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and // we have specializations to turn 32-bit multiply/shl into LEA or other ops. // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally // check for a constant operand to the multiply. if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8) return false; // i16 instruction encodings are longer and some i16 instructions are slow, // so those are not desirable. if (VT == MVT::i16) { switch (Opc) { default: break; case ISD::LOAD: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::SUB: case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: return false; } } // Any legal type not explicitly accounted for above here is desirable. return true; } SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl, SDValue Value, SDValue Addr, SelectionDAG &DAG) const { const Module *M = DAG.getMachineFunction().getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); if (IsCFProtectionSupported) { // In case control-flow branch protection is enabled, we need to add // notrack prefix to the indirect branch. // In order to do that we create NT_BRIND SDNode. // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix. return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr); } return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG); } bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { EVT VT = Op.getValueType(); bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL && isa(Op.getOperand(1)); // i16 is legal, but undesirable since i16 instruction encodings are longer // and some i16 instructions are slow. // 8-bit multiply-by-constant can usually be expanded to something cheaper // using LEA and/or other ALU ops. if (VT != MVT::i16 && !Is8BitMulByConstant) return false; auto IsFoldableRMW = [](SDValue Load, SDValue Op) { if (!Op.hasOneUse()) return false; SDNode *User = *Op->use_begin(); if (!ISD::isNormalStore(User)) return false; auto *Ld = cast(Load); auto *St = cast(User); return Ld->getBasePtr() == St->getBasePtr(); }; auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) { if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD) return false; if (!Op.hasOneUse()) return false; SDNode *User = *Op->use_begin(); if (User->getOpcode() != ISD::ATOMIC_STORE) return false; auto *Ld = cast(Load); auto *St = cast(User); return Ld->getBasePtr() == St->getBasePtr(); }; bool Commute = false; switch (Op.getOpcode()) { default: return false; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: break; case ISD::SHL: case ISD::SRA: case ISD::SRL: { SDValue N0 = Op.getOperand(0); // Look out for (store (shl (load), x)). if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op)) return false; break; } case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: Commute = true; LLVM_FALLTHROUGH; case ISD::SUB: { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); // Avoid disabling potential load folding opportunities. if (MayFoldLoad(N1) && (!Commute || !isa(N0) || (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op)))) return false; if (MayFoldLoad(N0) && ((Commute && !isa(N1)) || (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op)))) return false; if (IsFoldableAtomicRMW(N0, Op) || (Commute && IsFoldableAtomicRMW(N1, Op))) return false; } } PVT = MVT::i32; return true; } bool X86TargetLowering:: isDesirableToCombineBuildVectorToShuffleTruncate( ArrayRef ShuffleMask, EVT SrcVT, EVT TruncVT) const { assert(SrcVT.getVectorNumElements() == ShuffleMask.size() && "Element count mismatch"); assert( Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) && "Shuffle Mask expected to be legal"); // For 32-bit elements VPERMD is better than shuffle+truncate. // TODO: After we improve lowerBuildVector, add execption for VPERMW. if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2()) return false; if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask)) return false; return true; } //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// // Helper to match a string separated by whitespace. static bool matchAsm(StringRef S, ArrayRef Pieces) { S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. for (StringRef Piece : Pieces) { if (!S.startswith(Piece)) // Check if the piece matches. return false; S = S.substr(Piece.size()); StringRef::size_type Pos = S.find_first_not_of(" \t"); if (Pos == 0) // We matched a prefix. return false; S = S.substr(Pos); } return S.empty(); } static bool clobbersFlagRegisters(const SmallVector &AsmPieces) { if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { if (AsmPieces.size() == 3) return true; else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) return true; } } return false; } bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast(CI->getCalledValue()); const std::string &AsmStr = IA->getAsmString(); IntegerType *Ty = dyn_cast(CI->getType()); if (!Ty || Ty->getBitWidth() % 16 != 0) return false; // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; case 1: // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || matchAsm(AsmPieces[0], {"bswapl", "$0"}) || matchAsm(AsmPieces[0], {"bswapq", "$0"}) || matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. return IntrinsicLowering::LowerToByteSwap(CI); } // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); if (Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && matchAsm(AsmPieces[1], {"bswap", "%edx"}) && matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) return IntrinsicLowering::LowerToByteSwap(CI); } } break; } return false; } static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { X86::CondCode Cond = StringSwitch(Constraint) .Case("{@cca}", X86::COND_A) .Case("{@ccae}", X86::COND_AE) .Case("{@ccb}", X86::COND_B) .Case("{@ccbe}", X86::COND_BE) .Case("{@ccc}", X86::COND_B) .Case("{@cce}", X86::COND_E) .Case("{@ccz}", X86::COND_E) .Case("{@ccg}", X86::COND_G) .Case("{@ccge}", X86::COND_GE) .Case("{@ccl}", X86::COND_L) .Case("{@ccle}", X86::COND_LE) .Case("{@ccna}", X86::COND_BE) .Case("{@ccnae}", X86::COND_B) .Case("{@ccnb}", X86::COND_AE) .Case("{@ccnbe}", X86::COND_A) .Case("{@ccnc}", X86::COND_AE) .Case("{@ccne}", X86::COND_NE) .Case("{@ccnz}", X86::COND_NE) .Case("{@ccng}", X86::COND_LE) .Case("{@ccnge}", X86::COND_L) .Case("{@ccnl}", X86::COND_GE) .Case("{@ccnle}", X86::COND_G) .Case("{@ccno}", X86::COND_NO) .Case("{@ccnp}", X86::COND_P) .Case("{@ccns}", X86::COND_NS) .Case("{@cco}", X86::COND_O) .Case("{@ccp}", X86::COND_P) .Case("{@ccs}", X86::COND_S) .Default(X86::COND_INVALID); return Cond; } /// Given a constraint letter, return the type of constraint for this target. X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': case 'q': case 'Q': case 'f': case 't': case 'u': case 'y': case 'x': case 'v': case 'Y': case 'l': case 'k': // AVX512 masking registers. return C_RegisterClass; case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': return C_Register; case 'I': case 'J': case 'K': case 'N': case 'G': case 'L': case 'M': return C_Immediate; case 'C': case 'e': case 'Z': return C_Other; default: break; } } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; case 'Y': switch (Constraint[1]) { default: break; case 'z': case '0': return C_Register; case 'i': case 'm': case 'k': case 't': case '2': return C_RegisterClass; } } } else if (parseConstraintCode(Constraint) != X86::COND_INVALID) return C_Other; return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight X86TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); LLVM_FALLTHROUGH; case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': if (CallOperandVal->getType()->isIntegerTy()) weight = CW_SpecificReg; break; case 'f': case 't': case 'u': if (type->isFloatingPointTy()) weight = CW_SpecificReg; break; case 'y': if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; case 'Y': { unsigned Size = StringRef(constraint).size(); // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y' char NextChar = Size == 2 ? constraint[1] : 'i'; if (Size > 2) break; switch (NextChar) { default: return CW_Invalid; // XMM0 case 'z': case '0': if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) return CW_SpecificReg; return CW_Invalid; // Conditional OpMask regs (AVX512) case 'k': if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) return CW_Register; return CW_Invalid; // Any MMX reg case 'm': if (type->isX86_MMXTy() && Subtarget.hasMMX()) return weight; return CW_Invalid; // Any SSE reg when ISA >= SSE2, same as 'Y' case 'i': case 't': case '2': if (!Subtarget.hasSSE2()) return CW_Invalid; break; } // Fall through (handle "Y" constraint). LLVM_FALLTHROUGH; } case 'v': if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) weight = CW_Register; LLVM_FALLTHROUGH; case 'x': if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX())) weight = CW_Register; break; case 'k': // Enable conditional vector operations using %k<#> registers. if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) weight = CW_Register; break; case 'I': if (ConstantInt *C = dyn_cast(info.CallOperandVal)) { if (C->getZExtValue() <= 31) weight = CW_Constant; } break; case 'J': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 63) weight = CW_Constant; } break; case 'K': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) weight = CW_Constant; } break; case 'L': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) weight = CW_Constant; } break; case 'M': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 3) weight = CW_Constant; } break; case 'N': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xff) weight = CW_Constant; } break; case 'G': case 'C': if (isa(CallOperandVal)) { weight = CW_Constant; } break; case 'e': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) weight = CW_Constant; } break; case 'Z': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xffffffff) weight = CW_Constant; } break; } return weight; } /// Try to replace an X constraint, which matches anything, with another that /// has more specific requirements based on the type of the corresponding /// operand. const char *X86TargetLowering:: LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { if (Subtarget.hasSSE2()) return "Y"; if (Subtarget.hasSSE1()) return "x"; } return TargetLowering::LowerXConstraint(ConstraintVT); } // Lower @cc targets via setcc. SDValue X86TargetLowering::LowerAsmOutputForConstraint( SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); if (Cond == X86::COND_INVALID) return SDValue(); // Check that return type is valid. if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || OpInfo.ConstraintVT.getSizeInBits() < 8) report_fatal_error("Flag output operand is of invalid type"); // Get EFLAGS register. Only update chain when copyfrom is glued. if (Flag.getNode()) { Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag); Chain = Flag.getValue(1); } else Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32); // Extract CC code. SDValue CC = getSETCC(Cond, Flag, DL, DAG); // Extend to 32-bits SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC); return Result; } /// Lower the specified operand into the Ops vector. /// If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints for now. if (Constraint.length() > 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'J': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'K': if (ConstantSDNode *C = dyn_cast(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'L': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'M': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'N': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 255) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'O': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. } return; } case 'Z': { // 32-bit unsigned value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. return; } case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast(Op)) { bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1; BooleanContent BCont = getBooleanContents(MVT::i64); ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND; int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue() : CST->getSExtValue(); Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64); break; } // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't // be used as immediates. if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) return; // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. if (auto *GA = dyn_cast(Op)) // If we require an extra load to get this address, as in PIC mode, we // can't accept it. if (isGlobalStubReference( Subtarget.classifyGlobalReference(GA->getGlobal()))) return; break; } } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } /// Check if \p RC is a general purpose register class. /// I.e., GR* or one of their variant. static bool isGRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::GR8RegClass) || RC.hasSuperClassEq(&X86::GR16RegClass) || RC.hasSuperClassEq(&X86::GR32RegClass) || RC.hasSuperClassEq(&X86::GR64RegClass) || RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass); } /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. static bool isFRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::FR32XRegClass) || RC.hasSuperClassEq(&X86::FR64XRegClass) || RC.hasSuperClassEq(&X86::VR128XRegClass) || RC.hasSuperClassEq(&X86::VR256XRegClass) || RC.hasSuperClassEq(&X86::VR512RegClass); } /// Check if \p RC is a mask register class. /// I.e., VK* or one of their variant. static bool isVKClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::VK1RegClass) || RC.hasSuperClassEq(&X86::VK2RegClass) || RC.hasSuperClassEq(&X86::VK4RegClass) || RC.hasSuperClassEq(&X86::VK8RegClass) || RC.hasSuperClassEq(&X86::VK16RegClass) || RC.hasSuperClassEq(&X86::VK32RegClass) || RC.hasSuperClassEq(&X86::VK64RegClass); } std::pair X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { default: break; // 'A' means [ER]AX + [ER]DX. case 'A': if (Subtarget.is64Bit()) return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && "Expecting 64, 32 or 16 bit subtarget"); return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'k': if (Subtarget.hasAVX512()) { if (VT == MVT::i1) return std::make_pair(0U, &X86::VK1RegClass); if (VT == MVT::i8) return std::make_pair(0U, &X86::VK8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::VK16RegClass); } if (Subtarget.hasBWI()) { if (VT == MVT::i32) return std::make_pair(0U, &X86::VK32RegClass); if (VT == MVT::i64) return std::make_pair(0U, &X86::VK64RegClass); } break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget.is64Bit()) { if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i64 || VT == MVT::f64) return std::make_pair(0U, &X86::GR64RegClass); break; } LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32_ABCDRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); if (VT == MVT::i64) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget.hasSSE2()) break; LLVM_FALLTHROUGH; case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget.hasSSE1()) break; bool VConstraint = (Constraint[0] == 'v'); switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: case MVT::i32: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); // AVX types. case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR256XRegClass); if (Subtarget.hasAVX()) return std::make_pair(0U, &X86::VR256RegClass); break; case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: if (!Subtarget.hasAVX512()) break; if (VConstraint) return std::make_pair(0U, &X86::VR512RegClass); return std::make_pair(0U, &X86::VR512_0_15RegClass); } break; } } else if (Constraint.size() == 2 && Constraint[0] == 'Y') { switch (Constraint[1]) { default: break; case 'i': case 't': case '2': return getRegForInlineAsmConstraint(TRI, "Y", VT); case 'm': if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'z': case '0': if (!Subtarget.hasSSE1()) break; return std::make_pair(X86::XMM0, &X86::VR128RegClass); case 'k': // This register class doesn't allocate k0 for masked vector operation. if (Subtarget.hasAVX512()) { if (VT == MVT::i1) return std::make_pair(0U, &X86::VK1WMRegClass); if (VT == MVT::i8) return std::make_pair(0U, &X86::VK8WMRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::VK16WMRegClass); } if (Subtarget.hasBWI()) { if (VT == MVT::i32) return std::make_pair(0U, &X86::VK32WMRegClass); if (VT == MVT::i64) return std::make_pair(0U, &X86::VK64WMRegClass); } break; } } if (parseConstraintCode(Constraint) != X86::COND_INVALID) return std::make_pair(0U, &X86::GR32RegClass); // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && Constraint[3] == '(' && (Constraint[4] >= '0' && Constraint[4] <= '7') && Constraint[5] == ')' && Constraint[6] == '}') { // st(7) is not allocatable and thus not a member of RFP80. Return // singleton class in cases where we have a reference to it. if (Constraint[4] == '7') return std::make_pair(X86::FP7, &X86::RFP80_7RegClass); return std::make_pair(X86::FP0 + Constraint[4] - '0', &X86::RFP80RegClass); } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) return std::make_pair(X86::FP0, &X86::RFP80RegClass); // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) return std::make_pair(X86::EFLAGS, &X86::CCRRegClass); // dirflag -> DF if (StringRef("{dirflag}").equals_lower(Constraint)) return std::make_pair(X86::DF, &X86::DFCCRRegClass); // fpsr -> FPSW if (StringRef("{fpsr}").equals_lower(Constraint)) return std::make_pair(X86::FPSW, &X86::FPCCRRegClass); return Res; } // Make sure it isn't a register that requires 64-bit mode. if (!Subtarget.is64Bit() && (isFRClass(*Res.second) || isGRClass(*Res.second)) && TRI->getEncodingValue(Res.first) >= 8) { // Register requires REX prefix, but we're in 32-bit mode. return std::make_pair(0, nullptr); } // Make sure it isn't a register that requires AVX512. if (!Subtarget.hasAVX512() && isFRClass(*Res.second) && TRI->getEncodingValue(Res.first) & 0x10) { // Register requires EVEX prefix. return std::make_pair(0, nullptr); } // Otherwise, check to see if this is a register class of the wrong value // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should // return "eax". This should even work for things like getting 64bit integer // registers when given an f64 type. const TargetRegisterClass *Class = Res.second; // The generic code will match the first register class that contains the // given register. Thus, based on the ordering of the tablegened file, // the "plain" GR classes might not come first. // Therefore, use a helper method. if (isGRClass(*Class)) { unsigned Size = VT.getSizeInBits(); if (Size == 1) Size = 8; unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { bool is64Bit = Subtarget.is64Bit(); const TargetRegisterClass *RC = Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass) : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr) : nullptr; if (Size == 64 && !is64Bit) { // Model GCC's behavior here and select a fixed pair of 32-bit // registers. switch (DestReg) { case X86::RAX: return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); case X86::RDX: return std::make_pair(X86::EDX, &X86::GR32_DCRegClass); case X86::RCX: return std::make_pair(X86::ECX, &X86::GR32_CBRegClass); case X86::RBX: return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass); case X86::RSI: return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass); case X86::RDI: return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass); case X86::RBP: return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass); default: return std::make_pair(0, nullptr); } } if (RC && RC->contains(DestReg)) return std::make_pair(DestReg, RC); return Res; } // No register found/type mismatch. return std::make_pair(0, nullptr); } else if (isFRClass(*Class)) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32XRegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64XRegClass; else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT)) Res.second = &X86::VR128XRegClass; else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT)) Res.second = &X86::VR256XRegClass; else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } } else if (isVKClass(*Class)) { if (VT == MVT::i1) Res.second = &X86::VK1RegClass; else if (VT == MVT::i8) Res.second = &X86::VK8RegClass; else if (VT == MVT::i16) Res.second = &X86::VK16RegClass; else if (VT == MVT::i32) Res.second = &X86::VK32RegClass; else if (VT == MVT::i64) Res.second = &X86::VK64RegClass; else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } } return Res; } int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), // will take 2 allocations in the out of order engine instead of 1 // for plain addressing mode, i.e. inst (reg1). // E.g., // vaddps (%rsi,%rdx), %ymm0, %ymm1 // Requires two allocations (one for the load, one for the computation) // whereas: // vaddps (%rsi), %ymm0, %ymm1 // Requires just 1 allocation, i.e., freeing allocations for other operations // and having less micro operations to execute. // // For some X86 architectures, this is even worse because for instance for // stores, the complex addressing mode forces the instruction to use the // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; return -1; } bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. // The exception to this is vector division. Since x86 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (!Subtarget.is64Bit()) return; // Update IsSplitCSR in X86MachineFunctionInfo. X86MachineFunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void X86TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (X86::GR64RegClass.contains(*I)) RC = &X86::GR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert( Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { // If the function specifically requests stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); // Generally, if we aren't on Windows, the platform ABI does not include // support for stack probes, so don't emit them. if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() || MF.getFunction().hasFnAttribute("no-stack-arg-probe")) return ""; // We need a stack probe to conform to the Windows ABI. Choose the right // symbol. if (Subtarget.is64Bit()) return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } Index: vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.h =================================================================== --- vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.h (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.h (revision 351709) @@ -1,1651 +1,1647 @@ //===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H #define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/Target/TargetOptions.h" namespace llvm { class X86Subtarget; class X86TargetMachine; namespace X86ISD { // X86 Specific DAG Nodes enum NodeType : unsigned { // Start the numbering where the builtin ops leave off. FIRST_NUMBER = ISD::BUILTIN_OP_END, /// Bit scan forward. BSF, /// Bit scan reverse. BSR, /// Double shift instructions. These correspond to /// X86::SHLDxx and X86::SHRDxx instructions. SHLD, SHRD, /// Bitwise logical AND of floating point values. This corresponds /// to X86::ANDPS or X86::ANDPD. FAND, /// Bitwise logical OR of floating point values. This corresponds /// to X86::ORPS or X86::ORPD. FOR, /// Bitwise logical XOR of floating point values. This corresponds /// to X86::XORPS or X86::XORPD. FXOR, /// Bitwise logical ANDNOT of floating point values. This /// corresponds to X86::ANDNPS or X86::ANDNPD. FANDN, /// These operations represent an abstract X86 call /// instruction, which includes a bunch of information. In particular the /// operands of these node are: /// /// #0 - The incoming token chain /// #1 - The callee /// #2 - The number of arg bytes the caller pushes on the stack. /// #3 - The number of arg bytes the callee pops off the stack. /// #4 - The value to pass in AL/AX/EAX (optional) /// #5 - The value to pass in DL/DX/EDX (optional) /// /// The result values of these nodes are: /// /// #0 - The outgoing token chain /// #1 - The first register result value (optional) /// #2 - The second register result value (optional) /// CALL, /// Same as call except it adds the NoTrack prefix. NT_CALL, /// X86 compare and logical compare instructions. CMP, COMI, UCOMI, /// X86 bit-test instructions. BT, /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS /// operand, usually produced by a CMP instruction. SETCC, /// X86 Select SELECTS, // Same as SETCC except it's materialized with a sbb and the value is all // one's or all zero's. SETCC_CARRY, // R = carry_bit ? ~0 : 0 /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. /// Operands are two FP values to compare; result is a mask of /// 0s or 1s. Generally DTRT for C/C++ with NaNs. FSETCC, /// X86 FP SETCC, similar to above, but with output as an i1 mask and /// and a version with SAE. FSETCCM, FSETCCM_SAE, /// X86 conditional moves. Operand 0 and operand 1 are the two values /// to select from. Operand 2 is the condition code, and operand 3 is the /// flag operand produced by a CMP or TEST instruction. CMOV, /// X86 conditional branches. Operand 0 is the chain operand, operand 1 /// is the block to branch if condition is true, operand 2 is the /// condition code, and operand 3 is the flag operand produced by a CMP /// or TEST instruction. BRCOND, /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and /// operand 1 is the target address. NT_BRIND, /// Return with a flag operand. Operand 0 is the chain operand, operand /// 1 is the number of bytes of stack to pop. RET_FLAG, /// Return from interrupt. Operand 0 is the number of bytes to pop. IRET, /// Repeat fill, corresponds to X86::REP_STOSx. REP_STOS, /// Repeat move, corresponds to X86::REP_MOVSx. REP_MOVS, /// On Darwin, this node represents the result of the popl /// at function entry, used for PIC code. GlobalBaseReg, /// A wrapper node for TargetConstantPool, TargetJumpTable, /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, /// MCSymbol and TargetBlockAddress. Wrapper, /// Special wrapper used under X86-64 PIC mode for RIP /// relative displacements. WrapperRIP, /// Copies a 64-bit value from the low word of an XMM vector /// to an MMX vector. MOVDQ2Q, /// Copies a 32-bit value from the low word of a MMX /// vector to a GPR. MMX_MOVD2W, /// Copies a GPR into the low 32-bit word of a MMX vector /// and zero out the high word. MMX_MOVW2D, /// Extract an 8-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRB. PEXTRB, /// Extract a 16-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRW. PEXTRW, /// Insert any element of a 4 x float vector into any element /// of a destination 4 x floatvector. INSERTPS, /// Insert the lower 8-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRB. PINSRB, /// Insert the lower 16-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRW. PINSRW, /// Shuffle 16 8-bit values within a vector. PSHUFB, /// Compute Sum of Absolute Differences. PSADBW, /// Compute Double Block Packed Sum-Absolute-Differences DBPSADBW, /// Bitwise Logical AND NOT of Packed FP values. ANDNP, /// Blend where the selector is an immediate. BLENDI, /// Dynamic (non-constant condition) vector blend where only the sign bits /// of the condition elements are used. This is used to enforce that the /// condition mask is not valid for generic VSELECT optimizations. This /// is also used to implement the intrinsics. /// Operands are in VSELECT order: MASK, TRUE, FALSE BLENDV, /// Combined add and sub on an FP vector. ADDSUB, // FP vector ops with rounding mode. FADD_RND, FADDS, FADDS_RND, FSUB_RND, FSUBS, FSUBS_RND, FMUL_RND, FMULS, FMULS_RND, FDIV_RND, FDIVS, FDIVS_RND, FMAX_SAE, FMAXS_SAE, FMIN_SAE, FMINS_SAE, FSQRT_RND, FSQRTS, FSQRTS_RND, // FP vector get exponent. FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE, // Extract Normalized Mantissas. VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE, // FP Scale. SCALEF, SCALEF_RND, SCALEFS, SCALEFS_RND, // Unsigned Integer average. AVG, /// Integer horizontal add/sub. HADD, HSUB, /// Floating point horizontal add/sub. FHADD, FHSUB, // Detect Conflicts Within a Vector CONFLICT, /// Floating point max and min. FMAX, FMIN, /// Commutative FMIN and FMAX. FMAXC, FMINC, /// Scalar intrinsic floating point max and min. FMAXS, FMINS, /// Floating point reciprocal-sqrt and reciprocal approximation. /// Note that these typically require refinement /// in order to obtain suitable precision. FRSQRT, FRCP, // AVX-512 reciprocal approximations with a little more precision. RSQRT14, RSQRT14S, RCP14, RCP14S, // Thread Local Storage. TLSADDR, // Thread Local Storage. A call to get the start address // of the TLS block for the current module. TLSBASEADDR, // Thread Local Storage. When calling to an OS provided // thunk at the address from an earlier relocation. TLSCALL, // Exception Handling helpers. EH_RETURN, // SjLj exception handling setjmp. EH_SJLJ_SETJMP, // SjLj exception handling longjmp. EH_SJLJ_LONGJMP, // SjLj exception handling dispatch. EH_SJLJ_SETUP_DISPATCH, /// Tail call return. See X86TargetLowering::LowerCall for /// the list of operands. TC_RETURN, // Vector move to low scalar and zero higher vector elements. VZEXT_MOVL, // Vector integer truncate. VTRUNC, // Vector integer truncate with unsigned/signed saturation. VTRUNCUS, VTRUNCS, // Masked version of the above. Used when less than a 128-bit result is // produced since the mask only applies to the lower elements and can't // be represented by a select. // SRC, PASSTHRU, MASK VMTRUNC, VMTRUNCUS, VMTRUNCS, // Vector FP extend. VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE, // Vector FP round. VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND, // Masked version of above. Used for v2f64->v4f32. // SRC, PASSTHRU, MASK VMFPROUND, // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, // Vector shift elements VSHL, VSRL, VSRA, // Vector variable shift VSHLV, VSRLV, VSRAV, // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, // Shifts of mask registers. KSHIFTL, KSHIFTR, // Bit rotate by immediate VROTLI, VROTRI, // Vector packed double/float comparison. CMPP, // Vector integer comparisons. PCMPEQ, PCMPGT, // v8i16 Horizontal minimum and position. PHMINPOS, MULTISHIFT, /// Vector comparison generating mask bits for fp and /// integer signed and unsigned data types. CMPM, // Vector comparison with SAE for FP values CMPM_SAE, // Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, UMUL, OR, XOR, AND, // Bit field extract. BEXTR, // Zero High Bits Starting with Specified Bit Position. BZHI, // X86-specific multiply by immediate. MUL_IMM, // Vector sign bit extraction. MOVMSK, // Vector bitwise comparisons. PTEST, // Vector packed fp sign bitwise comparisons. TESTP, // OR/AND test for masks. KORTEST, KTEST, // ADD for masks. KADD, // Several flavors of instructions with vector shuffle behaviors. // Saturated signed/unnsigned packing. PACKSS, PACKUS, // Intra-lane alignr. PALIGNR, // AVX512 inter-lane alignr. VALIGN, PSHUFD, PSHUFHW, PSHUFLW, SHUFP, // VBMI2 Concat & Shift. VSHLD, VSHRD, VSHLDV, VSHRDV, //Shuffle Packed Values at 128-bit granularity. SHUF128, MOVDDUP, MOVSHDUP, MOVSLDUP, MOVLHPS, MOVHLPS, MOVSD, MOVSS, UNPCKL, UNPCKH, VPERMILPV, VPERMILPI, VPERMI, VPERM2X128, // Variable Permute (VPERM). // Res = VPERMV MaskV, V0 VPERMV, // 3-op Variable Permute (VPERMT2). // Res = VPERMV3 V0, MaskV, V1 VPERMV3, // Bitwise ternary logic. VPTERNLOG, // Fix Up Special Packed Float32/64 values. VFIXUPIMM, VFIXUPIMM_SAE, VFIXUPIMMS, VFIXUPIMMS_SAE, // Range Restriction Calculation For Packed Pairs of Float32/64 values. VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE, // Reduce - Perform Reduction Transformation on scalar\packed FP. VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE, // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. // Also used by the legacy (V)ROUND intrinsics where we mask out the // scaling part of the immediate. VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE, // Tests Types Of a FP Values for packed types. VFPCLASS, // Tests Types Of a FP Values for scalar types. VFPCLASSS, // Broadcast scalar to vector. VBROADCAST, // Broadcast mask to vector. VBROADCASTM, // Broadcast subvector to vector. SUBV_BROADCAST, /// SSE4A Extraction and Insertion. EXTRQI, INSERTQI, // XOP arithmetic/logical shifts. VPSHA, VPSHL, // XOP signed/unsigned integer comparisons. VPCOM, VPCOMU, // XOP packed permute bytes. VPPERM, // XOP two source permutation. VPERMIL2, // Vector multiply packed unsigned doubleword integers. PMULUDQ, // Vector multiply packed signed doubleword integers. PMULDQ, // Vector Multiply Packed UnsignedIntegers with Round and Scale. MULHRS, // Multiply and Add Packed Integers. VPMADDUBSW, VPMADDWD, // AVX512IFMA multiply and add. // NOTE: These are different than the instruction and perform // op0 x op1 + op2. VPMADD52L, VPMADD52H, // VNNI VPDPBUSD, VPDPBUSDS, VPDPWSSD, VPDPWSSDS, // FMA nodes. // We use the target independent ISD::FMA for the non-inverted case. FNMADD, FMSUB, FNMSUB, FMADDSUB, FMSUBADD, // FMA with rounding mode. FMADD_RND, FNMADD_RND, FMSUB_RND, FNMSUB_RND, FMADDSUB_RND, FMSUBADD_RND, // Compress and expand. COMPRESS, EXPAND, // Bits shuffle VPSHUFBITQMB, // Convert Unsigned/Integer to Floating-Point Value with rounding mode. SINT_TO_FP_RND, UINT_TO_FP_RND, SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP, SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND, // Vector float/double to signed/unsigned integer. CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND, // Scalar float/double to signed/unsigned integer. CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND, // Vector float/double to signed/unsigned integer with truncation. CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE, // Scalar float/double to signed/unsigned integer with truncation. CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE, // Vector signed/unsigned integer to float/double. CVTSI2P, CVTUI2P, // Masked versions of above. Used for v2f64->v4f32. // SRC, PASSTHRU, MASK MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI, MCVTSI2P, MCVTUI2P, // Vector float to bfloat16. // Convert TWO packed single data to one packed BF16 data CVTNE2PS2BF16, // Convert packed single data to packed BF16 data CVTNEPS2BF16, // Masked version of above. // SRC, PASSTHRU, MASK MCVTNEPS2BF16, // Dot product of BF16 pairs to accumulated into // packed single precision. DPBF16PS, // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, // Windows's _chkstk call to do stack probing. WIN_ALLOCA, // For allocating variable amounts of stack space when using // segmented stacks. Check if the current stacklet has enough space, and // falls back to heap allocation if not. SEG_ALLOCA, // Memory barriers. MEMBARRIER, MFENCE, // Store FP status word into i16 register. FNSTSW16r, // Store contents of %ah into %eflags. SAHF, // Get a random integer and indicate whether it is valid in CF. RDRAND, // Get a NIST SP800-90B & C compliant random integer and // indicate whether it is valid in CF. RDSEED, // Protection keys // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is // value for ECX. RDPKRU, WRPKRU, // SSE42 string comparisons. // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG // will emit one or two instructions based on which results are used. If // flags and index/mask this allows us to use a single instruction since // we won't have to pick and opcode for flags. Instead we can rely on the // DAG to CSE everything and decide at isel. PCMPISTR, PCMPESTR, // Test if in transactional execution. XTEST, // ERI instructions. RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE, RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE, // Conversions between float and half-float. CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE, // Masked version of above. // SRC, RND, PASSTHRU, MASK MCVTPS2PH, // Galois Field Arithmetic Instructions GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB, // LWP insert record. LWPINS, // User level wait UMWAIT, TPAUSE, // Enqueue Stores Instructions ENQCMD, ENQCMDS, // For avx512-vp2intersect VP2INTERSECT, // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, LCMPXCHG16_DAG, LCMPXCHG8_SAVE_EBX_DAG, LCMPXCHG16_SAVE_RBX_DAG, /// LOCK-prefixed arithmetic read-modify-write instructions. /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) LADD, LSUB, LOR, LXOR, LAND, // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, // extract_vector_elt, store. VEXTRACT_STORE, // Store FP control world into i16 memory. FNSTCW16m, /// This instruction implements FP_TO_SINT with the /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. It /// has two inputs (token chain and address) and two outputs (int value /// and token chain). Memory VT specifies the type to store to. FP_TO_INT_IN_MEM, /// This instruction implements SINT_TO_FP with the /// integer source in memory and FP reg result. This corresponds to the /// X86::FILD*m instructions. It has two inputs (token chain and address) /// and two outputs (FP value and token chain). FILD_FLAG also produces a /// flag). The integer source type is specified by the memory VT. FILD, FILD_FLAG, /// This instruction implements a fp->int store from FP stack /// slots. This corresponds to the fist instruction. It takes a /// chain operand, value to store, address, and glue. The memory VT /// specifies the type to store as. FIST, /// This instruction implements an extending load to FP stack slots. /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain /// operand, and ptr to load from. The memory VT specifies the type to /// load from. FLD, /// This instruction implements a truncating store from FP stack /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a /// chain operand, value to store, address, and glue. The memory VT /// specifies the type to store as. FST, /// This instruction grabs the address of the next argument /// from a va_list. (reads and modifies the va_list in memory) VAARG_64, // Vector truncating store with unsigned/signed saturation VTRUNCSTOREUS, VTRUNCSTORES, // Vector truncating masked store with unsigned/signed saturation VMTRUNCSTOREUS, VMTRUNCSTORES, // X86 specific gather and scatter MGATHER, MSCATTER, // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all // opcodes will be thought as target memory ops! }; } // end namespace X86ISD /// Define some predicates that are used for node matching. namespace X86 { /// Returns true if Elt is a constant zero or floating point constant +0.0. bool isZeroNode(SDValue Elt); /// Returns true of the given offset can be /// fit into displacement field of the instruction. bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement = true); /// Determines whether the callee is required to pop its /// own arguments. Callee pop is necessary to support tail calls. bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO); } // end namespace X86 //===--------------------------------------------------------------------===// // X86 Implementation of the TargetLowering interface class X86TargetLowering final : public TargetLowering { public: explicit X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI); unsigned getJumpTableEncoding() const override; bool useSoftFloat() const override; void markLibCallAttributes(MachineFunction *MF, unsigned CC, ArgListTy &Args) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override { return MVT::i8; } const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid, MCContext &Ctx) const override; /// Returns relocation base for the given PIC jumptable. SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const override; const MCExpr * getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const override; /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contains are placed at 16-byte boundaries while the rest are at /// 4-byte boundaries. unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const override; /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const override; /// Returns true if it's safe to use load / store of the /// specified type to expand memcpy / memset inline. This is mostly true /// for all types except for some special cases. For example, on X86 /// targets without SSE2 f64 load / store are done with fldl / fstpl which /// also does type conversion. Note the specified type doesn't have to be /// legal as the hook is used before type legalization. bool isSafeMemOpType(MVT VT) const override; /// Returns true if the target allows unaligned memory accesses of the /// specified type. Returns whether it is "fast" in the last argument. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const override; /// Provide custom lowering hooks for some operations. /// SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; /// Places new result values for the node in Results (their number /// and types must exactly match those of the original return values of /// the node), or leaves Results empty, which indicates that the node is not /// to be custom lowered after all. void LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; /// Replace the results of node with an illegal result /// type with new values built out of custom code. /// void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; // Return true if it is profitable to combine a BUILD_VECTOR with a // stride-pattern to a shuffle and a truncate. // Example of such a combine: // v4i32 build_vector((extract_elt V, 1), // (extract_elt V, 3), // (extract_elt V, 5), // (extract_elt V, 7)) // --> // v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to // v4i64) bool isDesirableToCombineBuildVectorToShuffleTruncate( ArrayRef ShuffleMask, EVT SrcVT, EVT TruncVT) const override; /// Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 /// instruction encodings are longer and some i16 instructions are slow. bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override; /// Return true if the target has native support for the /// specified value type and it is 'desirable' to use the type. e.g. On x86 /// i16 is legal, but undesirable since i16 instruction encodings are longer /// and some i16 instructions are slow. bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; /// Do not merge vector stores after legalization because that may conflict /// with x86-specific store splitting optimizations. bool mergeStoresAfterLegalization(EVT MemVT) const override { return !MemVT.isVector(); } bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; bool isCtlzFast() const override; bool hasBitPreservingFPLogic(EVT VT) const override { return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); } bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { // If the pair to store is a mixture of float and int values, we will // save two bitwise instructions and one float-to-int instruction and // increase one store instruction. There is potentially a more // significant benefit because it avoids the float->int domain switch // for input value. So It is more likely a win. if ((LTy.isFloatingPoint() && HTy.isInteger()) || (LTy.isInteger() && HTy.isFloatingPoint())) return true; // If the pair only contains int values, we will save two bitwise // instructions and increase one store instruction (costing one more // store buffer). Since the benefit is more blurred so we leave // such pair out until we get testcase to prove it is a win. return false; } bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; bool hasAndNotCompare(SDValue Y) const override; bool hasAndNot(SDValue Y) const override; bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { // For vectors, we don't have a preference.. if (XVT.isVector()) return false; auto VTIsOk = [](EVT VT) -> bool { return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64; }; // We are ok with KeptBitsVT being byte/word/dword, what MOVS supports. // XVT will be larger than KeptBitsVT. MVT KeptBitsVT = MVT::getIntegerVT(KeptBits); return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } - bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { - if (DAG.getMachineFunction().getFunction().hasMinSize()) - return false; - return true; - } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; bool shouldSplatInsEltVarIndex(EVT VT) const override; bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { return VT.isScalarInteger(); } /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. MVT hasFastEqualityCompare(unsigned NumBits) const override; /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const override; /// Determine which of the bits specified in Mask are known to be either /// zero or one and return them in the KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; /// Determine the number of bits in the operation that are sign bits. unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override; bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override; bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override; const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; bool ExpandInlineAsm(CallInst *CI) const override; ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override; const char *LowerXConstraint(EVT ConstraintVT) const override; /// Lower the specified operand into the Ops vector. If it is invalid, don't /// add anything to Ops. If hasMemory is true it means one of the asm /// constraint of the inline asm instruction being processed is 'm'. void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const override; unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "i") return InlineAsm::Constraint_i; else if (ConstraintCode == "o") return InlineAsm::Constraint_o; else if (ConstraintCode == "v") return InlineAsm::Constraint_v; else if (ConstraintCode == "X") return InlineAsm::Constraint_X; return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } /// Handle Lowering flag assembly outputs. SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override; /// Given a physical register constraint /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. On /// error, this returns a register number of 0. std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; /// Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I = nullptr) const override; /// Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can /// compare a register against the immediate without having to materialize /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; /// Return true if the specified immediate is legal /// add immediate, that is the target has add instructions which can /// add a register and the immediate without having to materialize /// the immediate into a register. bool isLegalAddImmediate(int64_t Imm) const override; bool isLegalStoreImmediate(int64_t Imm) const override; /// Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; bool isVectorShiftByScalarCheap(Type *Ty) const override; /// Add x86-specific opcodes to the default list. bool isBinOp(unsigned Opcode) const override; /// Returns true if the opcode is a commutative binary operation. bool isCommutativeBinOp(unsigned Opcode) const override; /// Return true if it's free to truncate a value of /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in /// register EAX to i16 by referencing its sub-register AX. bool isTruncateFree(Type *Ty1, Type *Ty2) const override; bool isTruncateFree(EVT VT1, EVT VT2) const override; bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; /// Return true if any actual instruction that defines a /// value of type Ty1 implicit zero-extends the value to Ty2 in the result /// register. This does not necessarily include registers defined in /// unknown ways, such as incoming arguments, or copies from unknown /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this /// does not necessarily apply to truncate instructions. e.g. on x86-64, /// all instructions that define 32-bit values implicit zero-extend the /// result out to 64 bits. bool isZExtFree(Type *Ty1, Type *Ty2) const override; bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; /// Return true if folding a vector load into ExtVal (a sign, zero, or any /// extend node) is profitable. bool isVectorLoadExtDesirable(SDValue) const override; /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this /// method returns true, otherwise fmuladd is expanded to fmul + fadd. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; /// Return true if it's profitable to narrow /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; /// Given an intrinsic, checks if on the target the intrinsic will need to map /// to a MemIntrinsicNode (touches memory). If this is the case, it returns /// true and stores the intrinsic information into the IntrinsicInfo that was /// passed to the function. bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override; /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to /// be legal. bool isShuffleMaskLegal(ArrayRef Mask, EVT VT) const override; /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a /// constant pool entry. bool isVectorClearMaskLegal(ArrayRef Mask, EVT VT) const override; /// Returns true if lowering to a jump table is allowed. bool areJTsAllowed(const Function *Fn) const override; /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. bool ShouldShrinkFPConstant(EVT VT) const override { // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more // expensive than a straight movsd. On the other hand, it's important to // shrink long double fp constant since fldt is very slow. return !X86ScalarSSEf64 || VT == MVT::f80; } /// Return true if we believe it is correct and profitable to reduce the /// load node to a smaller type. bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override; /// Return true if the specified scalar FP type is computed in an SSE /// register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 } /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override; bool convertSelectOfConstantsToMath(EVT VT) const override; bool decomposeMulByConstant(EVT VT, SDValue C) const override; bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, bool IsSigned) const override; /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override; /// Scalar ops always have equal or better analysis/performance/power than /// the vector equivalent, so this always makes sense if the scalar op is /// supported. bool shouldScalarizeBinop(SDValue) const override; /// Extract of a scalar FP value from index 0 of a vector is free. bool isExtractVecEltCheap(EVT VT, unsigned Index) const override { EVT EltVT = VT.getScalarType(); return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; } /// Overflow nodes should get combined/lowered to optimal instructions /// (they should allow eliminating explicit compares by getting flags from /// math ops). bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override; bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AddrSpace) const override { // If we can replace more than 2 scalar stores, there will be a reduction // in instructions even after we add a vector constant load. return NumElem > 2; } bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override; /// Intel processors have a unified instruction and data cache const char * getClearCacheBuiltinName() const override { return nullptr; // nothing to do, move along. } unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. unsigned getExceptionPointerRegister(const Constant *PersonalityFn) const override; /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. unsigned getExceptionSelectorRegister(const Constant *PersonalityFn) const override; virtual bool needsFixedCatchObjects() const override; /// This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; /// If the target has a standard location for the stack protector cookie, /// returns the address of that location. Otherwise, returns nullptr. Value *getIRStackGuard(IRBuilder<> &IRB) const override; bool useLoadStackGuardNode() const override; bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; Function *getSSPStackGuardCheck(const Module &M) const override; SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override; /// Return true if the target stores SafeStack pointer at a fixed offset in /// some non-standard address space, and populates the address space and /// offset as appropriate. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const; bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; /// Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool supportSwiftError() const override; StringRef getStackProbeSymbolName(MachineFunction &MF) const override; bool hasVectorBlend() const override { return true; } unsigned getMaxSupportedInterleaveFactor() const override { return 4; } /// Lower interleaved load(s) into target specific /// instructions/intrinsics. bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, SDValue Addr, SelectionDAG &DAG) const override; protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override; private: /// Keep a reference to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget &Subtarget; /// Select between SSE or x87 floating point ops. /// When SSE is available, use it for f32 operations. /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf32; bool X86ScalarSSEf64; /// A list of legal FP immediates. std::vector LegalFPImmediates; /// Indicate that this x86 target can instruction /// select the specified FP immediate natively. void addLegalFPImmediate(const APFloat& Imm) { LegalFPImmediates.push_back(Imm); } SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, uint32_t *RegMask) const; SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &ArgInfo, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo &MFI, unsigned i) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const; // Call lowering helpers. /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. bool IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG& DAG) const; SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, const SDLoc &dl) const; unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG) const; unsigned getAddressSpace(void) const; SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr, const unsigned char OpFlags = 0) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; /// Creates target global address or external symbol nodes for calls or /// other uses. SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override; bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction().hasFnAttribute(Attribute::NoUnwind); } void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const override; bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *SI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; bool needsCmpXchgNb(Type *MemType) const; void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const; // Utility function to emit the low-level va_arg code for X86-64. MachineBasicBlock * EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const; /// Utility function to emit the xmm reg save portion of va_start. MachineBasicBlock * EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1, MachineInstr &MI2, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredSelect(MachineInstr &I, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const; void emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const; MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const; MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const; MachineBasicBlock *emitFMA3Instr(MachineInstr &MI, MachineBasicBlock *MBB) const; MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const; /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent, for use with the given x86 condition code. SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const; /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; /// Emit flags for the given setcc condition and operands. Also returns the /// corresponding X86 condition code constant in X86CC. SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, SDValue &X86CC) const; /// Check if replacement of SQRT with RSQRT should be disabled. bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override; /// Use rsqrt* to speed up sqrt calculations. SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override; /// Use rcp* to speed up fdiv calculations. SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override; /// Reassociate floating point divisions into multiply by reciprocal. unsigned combineRepeatedFPDivisors() const override; }; namespace X86 { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } // end namespace X86 // Base class for all X86 non-masked store operations. class X86StoreSDNode : public MemSDNode { public: X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {} const SDValue &getValue() const { return getOperand(1); } const SDValue &getBasePtr() const { return getOperand(2); } static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::VTRUNCSTORES || N->getOpcode() == X86ISD::VTRUNCSTOREUS; } }; // Base class for all X86 masked store operations. // The class has the same order of operands as MaskedStoreSDNode for // convenience. class X86MaskedStoreSDNode : public MemSDNode { public: X86MaskedStoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {} const SDValue &getValue() const { return getOperand(1); } const SDValue &getBasePtr() const { return getOperand(2); } const SDValue &getMask() const { return getOperand(3); } static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::VMTRUNCSTORES || N->getOpcode() == X86ISD::VMTRUNCSTOREUS; } }; // X86 Truncating Store with Signed saturation. class TruncSStoreSDNode : public X86StoreSDNode { public: TruncSStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {} static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::VTRUNCSTORES; } }; // X86 Truncating Store with Unsigned saturation. class TruncUSStoreSDNode : public X86StoreSDNode { public: TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {} static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::VTRUNCSTOREUS; } }; // X86 Truncating Masked Store with Signed saturation. class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode { public: MaskedTruncSStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {} static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::VMTRUNCSTORES; } }; // X86 Truncating Masked Store with Unsigned saturation. class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode { public: MaskedTruncUSStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {} static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::VMTRUNCSTOREUS; } }; // X86 specific Gather/Scatter nodes. // The class has the same order of operands as MaskedGatherScatterSDNode for // convenience. class X86MaskedGatherScatterSDNode : public MemSDNode { public: X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {} const SDValue &getBasePtr() const { return getOperand(3); } const SDValue &getIndex() const { return getOperand(4); } const SDValue &getMask() const { return getOperand(2); } const SDValue &getScale() const { return getOperand(5); } static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::MGATHER || N->getOpcode() == X86ISD::MSCATTER; } }; class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode { public: X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO) {} const SDValue &getPassThru() const { return getOperand(1); } static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::MGATHER; } }; class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode { public: X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT, MMO) {} const SDValue &getValue() const { return getOperand(1); } static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::MSCATTER; } }; /// Generate unpacklo/unpackhi shuffle mask. template void createUnpackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Lo, bool Unary) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); int NumElts = VT.getVectorNumElements(); int NumEltsInLane = 128 / VT.getScalarSizeInBits(); for (int i = 0; i < NumElts; ++i) { unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; int Pos = (i % NumEltsInLane) / 2 + LaneStart; Pos += (Unary ? 0 : NumElts * (i % 2)); Pos += (Lo ? 0 : NumEltsInLane / 2); Mask.push_back(Pos); } } /// Helper function to scale a shuffle or target shuffle mask, replacing each /// mask index with the scaled sequential indices for an equivalent narrowed /// mask. This is the reverse process to canWidenShuffleElements, but can /// always succeed. template void scaleShuffleMask(int Scale, ArrayRef Mask, SmallVectorImpl &ScaledMask) { assert(0 < Scale && "Unexpected scaling factor"); size_t NumElts = Mask.size(); ScaledMask.assign(NumElts * Scale, -1); for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; // Repeat sentinel values in every mask element. if (M < 0) { for (int s = 0; s != Scale; ++s) ScaledMask[(Scale * i) + s] = M; continue; } // Scale mask element and increment across each mask element. for (int s = 0; s != Scale; ++s) ScaledMask[(Scale * i) + s] = (Scale * M) + s; } } } // end namespace llvm #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H Index: vendor/llvm/dist-release_90/lib/Target/X86/X86Subtarget.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Target/X86/X86Subtarget.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Target/X86/X86Subtarget.cpp (revision 351709) @@ -1,377 +1,380 @@ //===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the X86 specific subclass of TargetSubtargetInfo. // //===----------------------------------------------------------------------===// #include "X86.h" #include "X86CallLowering.h" #include "X86LegalizerInfo.h" #include "X86MacroFusion.h" #include "X86RegisterBankInfo.h" #include "X86Subtarget.h" #include "MCTargetDesc/X86BaseInfo.h" #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #if defined(_MSC_VER) #include #endif using namespace llvm; #define DEBUG_TYPE "subtarget" #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "X86GenSubtargetInfo.inc" // Temporary option to control early if-conversion for x86 while adding machine // models. static cl::opt X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, cl::desc("Enable early if-conversion on X86")); /// Classify a blockaddress reference for the current subtarget according to how /// we should reference it in a non-pcrel context. unsigned char X86Subtarget::classifyBlockAddressReference() const { return classifyLocalReference(nullptr); } /// Classify a global variable reference for the current subtarget according to /// how we should reference it in a non-pcrel context. unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const { return classifyGlobalReference(GV, *GV->getParent()); } unsigned char X86Subtarget::classifyLocalReference(const GlobalValue *GV) const { // If we're not PIC, it's not very interesting. if (!isPositionIndependent()) return X86II::MO_NO_FLAG; if (is64Bit()) { // 64-bit ELF PIC local references may use GOTOFF relocations. if (isTargetELF()) { switch (TM.getCodeModel()) { // 64-bit small code model is simple: All rip-relative. case CodeModel::Tiny: llvm_unreachable("Tiny codesize model not supported on X86"); case CodeModel::Small: case CodeModel::Kernel: return X86II::MO_NO_FLAG; // The large PIC code model uses GOTOFF. case CodeModel::Large: return X86II::MO_GOTOFF; // Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data. case CodeModel::Medium: if (isa(GV)) return X86II::MO_NO_FLAG; // All code is RIP-relative return X86II::MO_GOTOFF; // Local symbols use GOTOFF. } llvm_unreachable("invalid code model"); } // Otherwise, this is either a RIP-relative reference or a 64-bit movabsq, // both of which use MO_NO_FLAG. return X86II::MO_NO_FLAG; } // The COFF dynamic linker just patches the executable sections. if (isTargetCOFF()) return X86II::MO_NO_FLAG; if (isTargetDarwin()) { // 32 bit macho has no relocation for a-b if a is undefined, even if // b is in the section that is being relocated. // This means we have to use o load even for GVs that are known to be // local to the dso. if (GV && (GV->isDeclarationForLinker() || GV->hasCommonLinkage())) return X86II::MO_DARWIN_NONLAZY_PIC_BASE; return X86II::MO_PIC_BASE_OFFSET; } return X86II::MO_GOTOFF; } unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV, const Module &M) const { // The static large model never uses stubs. if (TM.getCodeModel() == CodeModel::Large && !isPositionIndependent()) return X86II::MO_NO_FLAG; // Absolute symbols can be referenced directly. if (GV) { if (Optional CR = GV->getAbsoluteSymbolRange()) { // See if we can use the 8-bit immediate form. Note that some instructions // will sign extend the immediate operand, so to be conservative we only // accept the range [0,128). if (CR->getUnsignedMax().ult(128)) return X86II::MO_ABS8; else return X86II::MO_NO_FLAG; } } if (TM.shouldAssumeDSOLocal(M, GV)) return classifyLocalReference(GV); if (isTargetCOFF()) { if (GV->hasDLLImportStorageClass()) return X86II::MO_DLLIMPORT; return X86II::MO_COFFSTUB; } + // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables. + if (isOSWindows()) + return X86II::MO_NO_FLAG; if (is64Bit()) { // ELF supports a large, truly PIC code model with non-PC relative GOT // references. Other object file formats do not. Use the no-flag, 64-bit // reference for them. if (TM.getCodeModel() == CodeModel::Large) return isTargetELF() ? X86II::MO_GOT : X86II::MO_NO_FLAG; return X86II::MO_GOTPCREL; } if (isTargetDarwin()) { if (!isPositionIndependent()) return X86II::MO_DARWIN_NONLAZY; return X86II::MO_DARWIN_NONLAZY_PIC_BASE; } return X86II::MO_GOT; } unsigned char X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const { return classifyGlobalFunctionReference(GV, *GV->getParent()); } unsigned char X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const { if (TM.shouldAssumeDSOLocal(M, GV)) return X86II::MO_NO_FLAG; // Functions on COFF can be non-DSO local for two reasons: // - They are marked dllimport // - They are extern_weak, and a stub is needed if (isTargetCOFF()) { if (GV->hasDLLImportStorageClass()) return X86II::MO_DLLIMPORT; return X86II::MO_COFFSTUB; } const Function *F = dyn_cast_or_null(GV); if (isTargetELF()) { if (is64Bit() && F && (CallingConv::X86_RegCall == F->getCallingConv())) // According to psABI, PLT stub clobbers XMM8-XMM15. // In Regcall calling convention those registers are used for passing // parameters. Thus we need to prevent lazy binding in Regcall. return X86II::MO_GOTPCREL; // If PLT must be avoided then the call should be via GOTPCREL. if (((F && F->hasFnAttribute(Attribute::NonLazyBind)) || (!F && M.getRtLibUseGOT())) && is64Bit()) return X86II::MO_GOTPCREL; return X86II::MO_PLT; } if (is64Bit()) { if (F && F->hasFnAttribute(Attribute::NonLazyBind)) // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). return X86II::MO_GOTPCREL; return X86II::MO_NO_FLAG; } return X86II::MO_NO_FLAG; } /// Return true if the subtarget allows calls to immediate address. bool X86Subtarget::isLegalToCallImmediateAddr() const { // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, // the following check for Win32 should be removed. if (In64BitMode || isTargetWin32()) return false; return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { std::string CPUName = CPU; if (CPUName.empty()) CPUName = "generic"; std::string FullFS = FS; if (In64BitMode) { // SSE2 should default to enabled in 64-bit mode, but can be turned off // explicitly. if (!FullFS.empty()) FullFS = "+sse2," + FullFS; else FullFS = "+sse2"; // If no CPU was specified, enable 64bit feature to satisy later check. if (CPUName == "generic") { if (!FullFS.empty()) FullFS = "+64bit," + FullFS; else FullFS = "+64bit"; } } // LAHF/SAHF are always supported in non-64-bit mode. if (!In64BitMode) { if (!FullFS.empty()) FullFS = "+sahf," + FullFS; else FullFS = "+sahf"; } // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of // 16-bytes and under that are reasonably fast. These features were // introduced with Intel's Nehalem/Silvermont and AMD's Family10h // micro-architectures respectively. if (hasSSE42() || hasSSE4A()) IsUAMem16Slow = false; // It's important to keep the MCSubtargetInfo feature bits in sync with // target data structure which is shared with MC code emitter, etc. if (In64BitMode) ToggleFeature(X86::Mode64Bit); else if (In32BitMode) ToggleFeature(X86::Mode32Bit); else if (In16BitMode) ToggleFeature(X86::Mode16Bit); else llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!"); LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel << ", 3DNowLevel " << X863DNowLevel << ", 64bit " << HasX86_64 << "\n"); if (In64BitMode && !HasX86_64) report_fatal_error("64-bit code requested on a subtarget that doesn't " "support it!"); // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both // 32 and 64 bit) and for all 64-bit targets. if (StackAlignOverride) stackAlignment = StackAlignOverride; else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || isTargetKFreeBSD() || In64BitMode) stackAlignment = 16; // Some CPUs have more overhead for gather. The specified overhead is relative // to the Load operation. "2" is the number provided by Intel architects. This // parameter is used for cost estimation of Gather Op and comparison with // other alternatives. // TODO: Remove the explicit hasAVX512()?, That would mean we would only // enable gather with a -march. if (hasAVX512() || (hasAVX2() && hasFastGather())) GatherOverhead = 2; if (hasAVX512()) ScatterOverhead = 2; // Consume the vector width attribute or apply any target specific limit. if (PreferVectorWidthOverride) PreferVectorWidth = PreferVectorWidthOverride; else if (Prefer256Bit) PreferVectorWidth = 256; } X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { initSubtargetFeatures(CPU, FS); return *this; } X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth) : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::None), TM(TM), TargetTriple(TT), StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), RequiredVectorWidth(RequiredVectorWidth), In64BitMode(TargetTriple.getArch() == Triple::x86_64), In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. if (!isPositionIndependent()) setPICStyle(PICStyles::None); else if (is64Bit()) setPICStyle(PICStyles::RIPRel); else if (isTargetCOFF()) setPICStyle(PICStyles::None); else if (isTargetDarwin()) setPICStyle(PICStyles::StubPIC); else if (isTargetELF()) setPICStyle(PICStyles::GOT); CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering())); Legalizer.reset(new X86LegalizerInfo(*this, TM)); auto *RBI = new X86RegisterBankInfo(*getRegisterInfo()); RegBankInfo.reset(RBI); InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI)); } const CallLowering *X86Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } const InstructionSelector *X86Subtarget::getInstructionSelector() const { return InstSelector.get(); } const LegalizerInfo *X86Subtarget::getLegalizerInfo() const { return Legalizer.get(); } const RegisterBankInfo *X86Subtarget::getRegBankInfo() const { return RegBankInfo.get(); } bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; } void X86Subtarget::getPostRAMutations( std::vector> &Mutations) const { Mutations.push_back(createX86MacroFusionDAGMutation()); } Index: vendor/llvm/dist-release_90/lib/Transforms/Utils/LoopUnroll.cpp =================================================================== --- vendor/llvm/dist-release_90/lib/Transforms/Utils/LoopUnroll.cpp (revision 351708) +++ vendor/llvm/dist-release_90/lib/Transforms/Utils/LoopUnroll.cpp (revision 351709) @@ -1,976 +1,978 @@ //===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements some loop unrolling utilities. It does not define any // actual pass or policy, but provides a single function to perform loop // unrolling. // // The process of unrolling can produce extraneous basic blocks linked with // unconditional branches. This will be corrected in the future. // //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/UnrollLoop.h" using namespace llvm; #define DEBUG_TYPE "loop-unroll" // TODO: Should these be here or in LoopUnroll? STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled"); STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)"); STATISTIC(NumUnrolledWithHeader, "Number of loops unrolled without a " "conditional latch (completely or otherwise)"); static cl::opt UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden, cl::desc("Allow runtime unrolled loops to be unrolled " "with epilog instead of prolog.")); static cl::opt UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden, cl::desc("Verify domtree after unrolling"), #ifdef EXPENSIVE_CHECKS cl::init(true) #else cl::init(false) #endif ); /// Convert the instruction operands from referencing the current values into /// those specified by VMap. void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) { for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) { Value *Op = I->getOperand(op); // Unwrap arguments of dbg.value intrinsics. bool Wrapped = false; if (auto *V = dyn_cast(Op)) if (auto *Unwrapped = dyn_cast(V->getMetadata())) { Op = Unwrapped->getValue(); Wrapped = true; } auto wrap = [&](Value *V) { auto &C = I->getContext(); return Wrapped ? MetadataAsValue::get(C, ValueAsMetadata::get(V)) : V; }; ValueToValueMapTy::iterator It = VMap.find(Op); if (It != VMap.end()) I->setOperand(op, wrap(It->second)); } if (PHINode *PN = dyn_cast(I)) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i)); if (It != VMap.end()) PN->setIncomingBlock(i, cast(It->second)); } } } /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. /// \param Blocks is a vector of basic blocks representing unrolled loop. /// \param L is the outer loop. /// It's possible that some of the blocks are in L, and some are not. In this /// case, if there is a use is outside L, and definition is inside L, we need to /// insert a phi-node, otherwise LCSSA will be broken. /// The function is just a helper function for llvm::UnrollLoop that returns /// true if this situation occurs, indicating that LCSSA needs to be fixed. static bool needToInsertPhisForLCSSA(Loop *L, std::vector Blocks, LoopInfo *LI) { for (BasicBlock *BB : Blocks) { if (LI->getLoopFor(BB) == L) continue; for (Instruction &I : *BB) { for (Use &U : I.operands()) { if (auto Def = dyn_cast(U)) { Loop *DefLoop = LI->getLoopFor(Def->getParent()); if (!DefLoop) continue; if (DefLoop->contains(L)) return true; } } } } return false; } /// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary /// and adds a mapping from the original loop to the new loop to NewLoops. /// Returns nullptr if no new loop was created and a pointer to the /// original loop OriginalBB was part of otherwise. const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB, BasicBlock *ClonedBB, LoopInfo *LI, NewLoopsMap &NewLoops) { // Figure out which loop New is in. const Loop *OldLoop = LI->getLoopFor(OriginalBB); assert(OldLoop && "Should (at least) be in the loop being unrolled!"); Loop *&NewLoop = NewLoops[OldLoop]; if (!NewLoop) { // Found a new sub-loop. assert(OriginalBB == OldLoop->getHeader() && "Header should be first in RPO"); NewLoop = LI->AllocateLoop(); Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop()); if (NewLoopParent) NewLoopParent->addChildLoop(NewLoop); else LI->addTopLevelLoop(NewLoop); NewLoop->addBasicBlockToLoop(ClonedBB, *LI); return OldLoop; } else { NewLoop->addBasicBlockToLoop(ClonedBB, *LI); return nullptr; } } /// The function chooses which type of unroll (epilog or prolog) is more /// profitabale. /// Epilog unroll is more profitable when there is PHI that starts from /// constant. In this case epilog will leave PHI start from constant, /// but prolog will convert it to non-constant. /// /// loop: /// PN = PHI [I, Latch], [CI, PreHeader] /// I = foo(PN) /// ... /// /// Epilog unroll case. /// loop: /// PN = PHI [I2, Latch], [CI, PreHeader] /// I1 = foo(PN) /// I2 = foo(I1) /// ... /// Prolog unroll case. /// NewPN = PHI [PrologI, Prolog], [CI, PreHeader] /// loop: /// PN = PHI [I2, Latch], [NewPN, PreHeader] /// I1 = foo(PN) /// I2 = foo(I1) /// ... /// static bool isEpilogProfitable(Loop *L) { BasicBlock *PreHeader = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); assert(PreHeader && Header); for (const PHINode &PN : Header->phis()) { if (isa(PN.getIncomingValueForBlock(PreHeader))) return true; } return false; } /// Perform some cleanup and simplifications on loops after unrolling. It is /// useful to simplify the IV's in the new loop, as well as do a quick /// simplify/dce pass of the instructions. void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC) { // Simplify any new induction variables in the partially unrolled loop. if (SE && SimplifyIVs) { SmallVector DeadInsts; simplifyLoopIVs(L, SE, DT, LI, DeadInsts); // Aggressively clean up dead instructions that simplifyLoopIVs already // identified. Any remaining should be cleaned up below. while (!DeadInsts.empty()) if (Instruction *Inst = dyn_cast_or_null(&*DeadInsts.pop_back_val())) RecursivelyDeleteTriviallyDeadInstructions(Inst); } // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); for (BasicBlock *BB : L->getBlocks()) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { Instruction *Inst = &*I++; if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC})) if (LI->replacementPreservesLCSSAForm(Inst, V)) Inst->replaceAllUsesWith(V); if (isInstructionTriviallyDead(Inst)) BB->getInstList().erase(Inst); } } // TODO: after peeling or unrolling, previously loop variant conditions are // likely to fold to constants, eagerly propagating those here will require // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be // appropriate. } /// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional /// branch instruction. However, if the trip count (and multiple) are not known, /// loop unrolling will mostly produce more code that is no faster. /// /// TripCount is the upper bound of the iteration on which control exits /// LatchBlock. Control may exit the loop prior to TripCount iterations either /// via an early branch in other loop block or via LatchBlock terminator. This /// is relaxed from the general definition of trip count which is the number of /// times the loop header executes. Note that UnrollLoop assumes that the loop /// counter test is in LatchBlock in order to remove unnecesssary instances of /// the test. If control can exit the loop from the LatchBlock's terminator /// prior to TripCount iterations, flag PreserveCondBr needs to be set. /// /// PreserveCondBr indicates whether the conditional branch of the LatchBlock /// needs to be preserved. It is needed when we use trip count upper bound to /// fully unroll the loop. If PreserveOnlyFirst is also set then only the first /// conditional branch needs to be preserved. /// /// Similarly, TripMultiple divides the number of times that the LatchBlock may /// execute without exiting the loop. /// /// If AllowRuntime is true then UnrollLoop will consider unrolling loops that /// have a runtime (i.e. not compile time constant) trip count. Unrolling these /// loops require a unroll "prologue" that runs "RuntimeTripCount % Count" /// iterations before branching into the unrolled loop. UnrollLoop will not /// runtime-unroll the loop if computing RuntimeTripCount will be expensive and /// AllowExpensiveTripCount is false. /// /// If we want to perform PGO-based loop peeling, PeelCount is set to the /// number of iterations we want to peel off. /// /// The LoopInfo Analysis that is passed will be kept consistent. /// /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and /// DominatorTree if they are non-null. /// /// If RemainderLoop is non-null, it will receive the remainder loop (if /// required and not fully unrolled). LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); return LoopUnrollResult::Unmodified; } BasicBlock *LatchBlock = L->getLoopLatch(); if (!LatchBlock) { LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n"); return LoopUnrollResult::Unmodified; } // Loops with indirectbr cannot be cloned. if (!L->isSafeToClone()) { LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n"); return LoopUnrollResult::Unmodified; } // The current loop unroll pass can unroll loops with a single latch or header // that's a conditional branch exiting the loop. // FIXME: The implementation can be extended to work with more complicated // cases, e.g. loops with multiple latches. BasicBlock *Header = L->getHeader(); BranchInst *HeaderBI = dyn_cast(Header->getTerminator()); BranchInst *BI = dyn_cast(LatchBlock->getTerminator()); // FIXME: Support loops without conditional latch and multiple exiting blocks. if (!BI || (BI->isUnconditional() && (!HeaderBI || HeaderBI->isUnconditional() || L->getExitingBlock() != Header))) { LLVM_DEBUG(dbgs() << " Can't unroll; loop not terminated by a conditional " "branch in the latch or header.\n"); return LoopUnrollResult::Unmodified; } auto CheckLatchSuccessors = [&](unsigned S1, unsigned S2) { return BI->isConditional() && BI->getSuccessor(S1) == Header && !L->contains(BI->getSuccessor(S2)); }; // If we have a conditional latch, it must exit the loop. if (BI && BI->isConditional() && !CheckLatchSuccessors(0, 1) && !CheckLatchSuccessors(1, 0)) { LLVM_DEBUG( dbgs() << "Can't unroll; a conditional latch must exit the loop"); return LoopUnrollResult::Unmodified; } auto CheckHeaderSuccessors = [&](unsigned S1, unsigned S2) { return HeaderBI && HeaderBI->isConditional() && L->contains(HeaderBI->getSuccessor(S1)) && !L->contains(HeaderBI->getSuccessor(S2)); }; // If we do not have a conditional latch, the header must exit the loop. if (BI && !BI->isConditional() && HeaderBI && HeaderBI->isConditional() && !CheckHeaderSuccessors(0, 1) && !CheckHeaderSuccessors(1, 0)) { LLVM_DEBUG(dbgs() << "Can't unroll; conditional header must exit the loop"); return LoopUnrollResult::Unmodified; } if (Header->hasAddressTaken()) { // The loop-rotate pass can be helpful to avoid this in many cases. LLVM_DEBUG( dbgs() << " Won't unroll loop: address of header block is taken.\n"); return LoopUnrollResult::Unmodified; } if (ULO.TripCount != 0) LLVM_DEBUG(dbgs() << " Trip Count = " << ULO.TripCount << "\n"); if (ULO.TripMultiple != 1) LLVM_DEBUG(dbgs() << " Trip Multiple = " << ULO.TripMultiple << "\n"); // Effectively "DCE" unrolled iterations that are beyond the tripcount // and will never be executed. if (ULO.TripCount != 0 && ULO.Count > ULO.TripCount) ULO.Count = ULO.TripCount; // Don't enter the unroll code if there is nothing to do. if (ULO.TripCount == 0 && ULO.Count < 2 && ULO.PeelCount == 0) { LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n"); return LoopUnrollResult::Unmodified; } assert(ULO.Count > 0); assert(ULO.TripMultiple > 0); assert(ULO.TripCount == 0 || ULO.TripCount % ULO.TripMultiple == 0); // Are we eliminating the loop control altogether? bool CompletelyUnroll = ULO.Count == ULO.TripCount; SmallVector ExitBlocks; L->getExitBlocks(ExitBlocks); std::vector OriginalLoopBlocks = L->getBlocks(); // Go through all exits of L and see if there are any phi-nodes there. We just // conservatively assume that they're inserted to preserve LCSSA form, which // means that complete unrolling might break this form. We need to either fix // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For // now we just recompute LCSSA for the outer loop, but it should be possible // to fix it in-place. bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll && any_of(ExitBlocks, [](const BasicBlock *BB) { return isa(BB->begin()); }); // We assume a run-time trip count if the compiler cannot // figure out the loop trip count and the unroll-runtime // flag is specified. bool RuntimeTripCount = (ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime); assert((!RuntimeTripCount || !ULO.PeelCount) && "Did not expect runtime trip-count unrolling " "and peeling for the same loop"); bool Peeled = false; if (ULO.PeelCount) { Peeled = peelLoop(L, ULO.PeelCount, LI, SE, DT, AC, PreserveLCSSA); // Successful peeling may result in a change in the loop preheader/trip // counts. If we later unroll the loop, we want these to be updated. if (Peeled) { // According to our guards and profitability checks the only // meaningful exit should be latch block. Other exits go to deopt, // so we do not worry about them. BasicBlock *ExitingBlock = L->getLoopLatch(); assert(ExitingBlock && "Loop without exiting block?"); assert(L->isLoopExiting(ExitingBlock) && "Latch is not exiting?"); Preheader = L->getLoopPreheader(); ULO.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); ULO.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); } } // Loops containing convergent instructions must have a count that divides // their TripMultiple. LLVM_DEBUG( { bool HasConvergent = false; for (auto &BB : L->blocks()) for (auto &I : *BB) if (auto CS = CallSite(&I)) HasConvergent |= CS.isConvergent(); assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) && "Unroll count must divide trip multiple if loop contains a " "convergent operation."); }); bool EpilogProfitability = UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog : isEpilogProfitable(L); if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 && !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount, EpilogProfitability, ULO.UnrollRemainder, ULO.ForgetAllSCEV, LI, SE, DT, AC, PreserveLCSSA, RemainderLoop)) { if (ULO.Force) RuntimeTripCount = false; else { LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be " "generated when assuming runtime trip count\n"); return LoopUnrollResult::Unmodified; } } // If we know the trip count, we know the multiple... unsigned BreakoutTrip = 0; if (ULO.TripCount != 0) { BreakoutTrip = ULO.TripCount % ULO.Count; ULO.TripMultiple = 0; } else { // Figure out what multiple to use. BreakoutTrip = ULO.TripMultiple = (unsigned)GreatestCommonDivisor64(ULO.Count, ULO.TripMultiple); } using namespace ore; // Report the unrolling decision. if (CompletelyUnroll) { LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() << " with trip count " << ULO.TripCount << "!\n"); if (ORE) ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(), L->getHeader()) << "completely unrolled loop with " << NV("UnrollCount", ULO.TripCount) << " iterations"; }); } else if (ULO.PeelCount) { LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName() << " with iteration count " << ULO.PeelCount << "!\n"); if (ORE) ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(), L->getHeader()) << " peeled loop by " << NV("PeelCount", ULO.PeelCount) << " iterations"; }); } else { auto DiagBuilder = [&]() { OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(), L->getHeader()); return Diag << "unrolled loop by a factor of " << NV("UnrollCount", ULO.Count); }; LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by " << ULO.Count); if (ULO.TripMultiple == 0 || BreakoutTrip != ULO.TripMultiple) { LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip); if (ORE) ORE->emit([&]() { return DiagBuilder() << " with a breakout at trip " << NV("BreakoutTrip", BreakoutTrip); }); } else if (ULO.TripMultiple != 1) { LLVM_DEBUG(dbgs() << " with " << ULO.TripMultiple << " trips per branch"); if (ORE) ORE->emit([&]() { return DiagBuilder() << " with " << NV("TripMultiple", ULO.TripMultiple) << " trips per branch"; }); } else if (RuntimeTripCount) { LLVM_DEBUG(dbgs() << " with run-time trip count"); if (ORE) ORE->emit( [&]() { return DiagBuilder() << " with run-time trip count"; }); } LLVM_DEBUG(dbgs() << "!\n"); } // We are going to make changes to this loop. SCEV may be keeping cached info // about it, in particular about backedge taken count. The changes we make // are guaranteed to invalidate this information for our loop. It is tempting // to only invalidate the loop being unrolled, but it is incorrect as long as // all exiting branches from all inner loops have impact on the outer loops, // and if something changes inside them then any of outer loops may also // change. When we forget outermost loop, we also forget all contained loops // and this is what we need here. if (SE) { if (ULO.ForgetAllSCEV) SE->forgetAllLoops(); else SE->forgetTopmostLoop(L); } bool ContinueOnTrue; bool LatchIsExiting = BI->isConditional(); BasicBlock *LoopExit = nullptr; if (LatchIsExiting) { ContinueOnTrue = L->contains(BI->getSuccessor(0)); LoopExit = BI->getSuccessor(ContinueOnTrue); } else { NumUnrolledWithHeader++; ContinueOnTrue = L->contains(HeaderBI->getSuccessor(0)); LoopExit = HeaderBI->getSuccessor(ContinueOnTrue); } // For the first iteration of the loop, we should use the precloned values for // PHI nodes. Insert associations now. ValueToValueMapTy LastValueMap; std::vector OrigPHINode; for (BasicBlock::iterator I = Header->begin(); isa(I); ++I) { OrigPHINode.push_back(cast(I)); } std::vector Headers; std::vector HeaderSucc; std::vector Latches; Headers.push_back(Header); Latches.push_back(LatchBlock); if (!LatchIsExiting) { auto *Term = cast(Header->getTerminator()); if (Term->isUnconditional() || L->contains(Term->getSuccessor(0))) { assert(L->contains(Term->getSuccessor(0))); HeaderSucc.push_back(Term->getSuccessor(0)); } else { assert(L->contains(Term->getSuccessor(1))); HeaderSucc.push_back(Term->getSuccessor(1)); } } // The current on-the-fly SSA update requires blocks to be processed in // reverse postorder so that LastValueMap contains the correct value at each // exit. LoopBlocksDFS DFS(L); DFS.perform(LI); // Stash the DFS iterators before adding blocks to the loop. LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); std::vector UnrolledLoopBlocks = L->getBlocks(); // Loop Unrolling might create new loops. While we do preserve LoopInfo, we // might break loop-simplified form for these loops (as they, e.g., would // share the same exit blocks). We'll keep track of loops for which we can // break this so that later we can re-simplify them. SmallSetVector LoopsToSimplify; for (Loop *SubLoop : *L) LoopsToSimplify.insert(SubLoop); if (Header->getParent()->isDebugInfoForProfiling()) for (BasicBlock *BB : L->getBlocks()) for (Instruction &I : *BB) if (!isa(&I)) if (const DILocation *DIL = I.getDebugLoc()) { auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count); if (NewDIL) I.setDebugLoc(NewDIL.getValue()); else LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " << DIL->getFilename() << " Line: " << DIL->getLine()); } for (unsigned It = 1; It != ULO.Count; ++It) { std::vector NewBlocks; SmallDenseMap NewLoops; NewLoops[L] = L; for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { ValueToValueMapTy VMap; BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); Header->getParent()->getBasicBlockList().push_back(New); assert((*BB != Header || LI->getLoopFor(*BB) == L) && "Header should not be in a sub-loop"); // Tell LI about New. const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops); if (OldLoop) LoopsToSimplify.insert(NewLoops[OldLoop]); if (*BB == Header) // Loop over all of the PHI nodes in the block, changing them to use // the incoming values from the previous block. for (PHINode *OrigPHI : OrigPHINode) { PHINode *NewPHI = cast(VMap[OrigPHI]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); if (Instruction *InValI = dyn_cast(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; VMap[OrigPHI] = InVal; New->getInstList().erase(NewPHI); } // Update our running map of newest clones LastValueMap[*BB] = New; for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); VI != VE; ++VI) LastValueMap[VI->first] = VI->second; // Add phi entries for newly created values to all exit blocks. for (BasicBlock *Succ : successors(*BB)) { if (L->contains(Succ)) continue; for (PHINode &PHI : Succ->phis()) { Value *Incoming = PHI.getIncomingValueForBlock(*BB); ValueToValueMapTy::iterator It = LastValueMap.find(Incoming); if (It != LastValueMap.end()) Incoming = It->second; PHI.addIncoming(Incoming, New); } } // Keep track of new headers and latches as we create them, so that // we can insert the proper branches later. if (*BB == Header) Headers.push_back(New); if (*BB == LatchBlock) Latches.push_back(New); // Keep track of the successor of the new header in the current iteration. for (auto *Pred : predecessors(*BB)) if (Pred == Header) { HeaderSucc.push_back(New); break; } NewBlocks.push_back(New); UnrolledLoopBlocks.push_back(New); // Update DomTree: since we just copy the loop body, and each copy has a // dedicated entry block (copy of the header block), this header's copy // dominates all copied blocks. That means, dominance relations in the // copied body are the same as in the original body. if (DT) { if (*BB == Header) DT->addNewBlock(New, Latches[It - 1]); else { auto BBDomNode = DT->getNode(*BB); auto BBIDom = BBDomNode->getIDom(); BasicBlock *OriginalBBIDom = BBIDom->getBlock(); DT->addNewBlock( New, cast(LastValueMap[cast(OriginalBBIDom)])); } } } // Remap all instructions in the most recent iteration for (BasicBlock *NewBlock : NewBlocks) { for (Instruction &I : *NewBlock) { ::remapInstruction(&I, LastValueMap); if (auto *II = dyn_cast(&I)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); } } } // Loop over the PHI nodes in the original block, setting incoming values. for (PHINode *PN : OrigPHINode) { if (CompletelyUnroll) { PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); Header->getInstList().erase(PN); } else if (ULO.Count > 1) { Value *InVal = PN->removeIncomingValue(LatchBlock, false); // If this value was defined in the loop, take the value defined by the // last iteration of the loop. if (Instruction *InValI = dyn_cast(InVal)) { if (L->contains(InValI)) InVal = LastValueMap[InVal]; } assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch"); PN->addIncoming(InVal, Latches.back()); } } auto setDest = [LoopExit, ContinueOnTrue](BasicBlock *Src, BasicBlock *Dest, ArrayRef NextBlocks, - BasicBlock *CurrentHeader, + BasicBlock *BlockInLoop, bool NeedConditional) { auto *Term = cast(Src->getTerminator()); if (NeedConditional) { // Update the conditional branch's successor for the following // iteration. Term->setSuccessor(!ContinueOnTrue, Dest); } else { // Remove phi operands at this loop exit if (Dest != LoopExit) { BasicBlock *BB = Src; for (BasicBlock *Succ : successors(BB)) { - if (Succ == CurrentHeader) + // Preserve the incoming value from BB if we are jumping to the block + // in the current loop. + if (Succ == BlockInLoop) continue; for (PHINode &Phi : Succ->phis()) Phi.removeIncomingValue(BB, false); } } // Replace the conditional branch with an unconditional one. BranchInst::Create(Dest, Term); Term->eraseFromParent(); } }; // Now that all the basic blocks for the unrolled iterations are in place, // set up the branches to connect them. if (LatchIsExiting) { // Set up latches to branch to the new header in the unrolled iterations or // the loop exit for the last latch in a fully unrolled loop. for (unsigned i = 0, e = Latches.size(); i != e; ++i) { // The branch destination. unsigned j = (i + 1) % e; BasicBlock *Dest = Headers[j]; bool NeedConditional = true; if (RuntimeTripCount && j != 0) { NeedConditional = false; } // For a complete unroll, make the last iteration end with a branch // to the exit block. if (CompletelyUnroll) { if (j == 0) Dest = LoopExit; // If using trip count upper bound to completely unroll, we need to keep // the conditional branch except the last one because the loop may exit // after any iteration. assert(NeedConditional && "NeedCondition cannot be modified by both complete " "unrolling and runtime unrolling"); NeedConditional = (ULO.PreserveCondBr && j && !(ULO.PreserveOnlyFirst && i != 0)); } else if (j != BreakoutTrip && (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0)) { // If we know the trip count or a multiple of it, we can safely use an // unconditional branch for some iterations. NeedConditional = false; } setDest(Latches[i], Dest, Headers, Headers[i], NeedConditional); } } else { // Setup headers to branch to their new successors in the unrolled // iterations. for (unsigned i = 0, e = Headers.size(); i != e; ++i) { // The branch destination. unsigned j = (i + 1) % e; BasicBlock *Dest = HeaderSucc[i]; bool NeedConditional = true; if (RuntimeTripCount && j != 0) NeedConditional = false; if (CompletelyUnroll) // We cannot drop the conditional branch for the last condition, as we // may have to execute the loop body depending on the condition. NeedConditional = j == 0 || ULO.PreserveCondBr; else if (j != BreakoutTrip && (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0)) // If we know the trip count or a multiple of it, we can safely use an // unconditional branch for some iterations. NeedConditional = false; - setDest(Headers[i], Dest, Headers, Headers[i], NeedConditional); + setDest(Headers[i], Dest, Headers, HeaderSucc[i], NeedConditional); } // Set up latches to branch to the new header in the unrolled iterations or // the loop exit for the last latch in a fully unrolled loop. for (unsigned i = 0, e = Latches.size(); i != e; ++i) { // The original branch was replicated in each unrolled iteration. BranchInst *Term = cast(Latches[i]->getTerminator()); // The branch destination. unsigned j = (i + 1) % e; BasicBlock *Dest = Headers[j]; // When completely unrolling, the last latch becomes unreachable. if (CompletelyUnroll && j == 0) new UnreachableInst(Term->getContext(), Term); else // Replace the conditional branch with an unconditional one. BranchInst::Create(Dest, Term); Term->eraseFromParent(); } } // Update dominators of blocks we might reach through exits. // Immediate dominator of such block might change, because we add more // routes which can lead to the exit: we can now reach it from the copied // iterations too. if (DT && ULO.Count > 1) { for (auto *BB : OriginalLoopBlocks) { auto *BBDomNode = DT->getNode(BB); SmallVector ChildrenToUpdate; for (auto *ChildDomNode : BBDomNode->getChildren()) { auto *ChildBB = ChildDomNode->getBlock(); if (!L->contains(ChildBB)) ChildrenToUpdate.push_back(ChildBB); } BasicBlock *NewIDom; BasicBlock *&TermBlock = LatchIsExiting ? LatchBlock : Header; auto &TermBlocks = LatchIsExiting ? Latches : Headers; if (BB == TermBlock) { // The latch is special because we emit unconditional branches in // some cases where the original loop contained a conditional branch. // Since the latch is always at the bottom of the loop, if the latch // dominated an exit before unrolling, the new dominator of that exit // must also be a latch. Specifically, the dominator is the first // latch which ends in a conditional branch, or the last latch if // there is no such latch. // For loops exiting from the header, we limit the supported loops // to have a single exiting block. NewIDom = TermBlocks.back(); for (BasicBlock *Iter : TermBlocks) { Instruction *Term = Iter->getTerminator(); if (isa(Term) && cast(Term)->isConditional()) { NewIDom = Iter; break; } } } else { // The new idom of the block will be the nearest common dominator // of all copies of the previous idom. This is equivalent to the // nearest common dominator of the previous idom and the first latch, // which dominates all copies of the previous idom. NewIDom = DT->findNearestCommonDominator(BB, LatchBlock); } for (auto *ChildBB : ChildrenToUpdate) DT->changeImmediateDominator(ChildBB, NewIDom); } } assert(!DT || !UnrollVerifyDomtree || DT->verify(DominatorTree::VerificationLevel::Fast)); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); // Merge adjacent basic blocks, if possible. for (BasicBlock *Latch : Latches) { BranchInst *Term = dyn_cast(Latch->getTerminator()); assert((Term || (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) && "Need a branch as terminator, except when fully unrolling with " "unconditional latch"); if (Term && Term->isUnconditional()) { BasicBlock *Dest = Term->getSuccessor(0); BasicBlock *Fold = Dest->getUniquePredecessor(); if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) { // Dest has been folded into Fold. Update our worklists accordingly. std::replace(Latches.begin(), Latches.end(), Dest, Fold); UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(), UnrolledLoopBlocks.end(), Dest), UnrolledLoopBlocks.end()); } } } // At this point, the code is well formed. We now simplify the unrolled loop, // doing constant propagation and dead code elimination as we go. simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI, SE, DT, AC); NumCompletelyUnrolled += CompletelyUnroll; ++NumUnrolled; Loop *OuterL = L->getParentLoop(); // Update LoopInfo if the loop is completely removed. if (CompletelyUnroll) LI->erase(L); // After complete unrolling most of the blocks should be contained in OuterL. // However, some of them might happen to be out of OuterL (e.g. if they // precede a loop exit). In this case we might need to insert PHI nodes in // order to preserve LCSSA form. // We don't need to check this if we already know that we need to fix LCSSA // form. // TODO: For now we just recompute LCSSA for the outer loop in this case, but // it should be possible to fix it in-place. if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA) NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI); // If we have a pass and a DominatorTree we should re-simplify impacted loops // to ensure subsequent analyses can rely on this form. We want to simplify // at least one layer outside of the loop that was unrolled so that any // changes to the parent loop exposed by the unrolling are considered. if (DT) { if (OuterL) { // OuterL includes all loops for which we can break loop-simplify, so // it's sufficient to simplify only it (it'll recursively simplify inner // loops too). if (NeedToFixLCSSA) { // LCSSA must be performed on the outermost affected loop. The unrolled // loop's last loop latch is guaranteed to be in the outermost loop // after LoopInfo's been updated by LoopInfo::erase. Loop *LatchLoop = LI->getLoopFor(Latches.back()); Loop *FixLCSSALoop = OuterL; if (!FixLCSSALoop->contains(LatchLoop)) while (FixLCSSALoop->getParentLoop() != LatchLoop) FixLCSSALoop = FixLCSSALoop->getParentLoop(); formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE); } else if (PreserveLCSSA) { assert(OuterL->isLCSSAForm(*DT) && "Loops should be in LCSSA form after loop-unroll."); } // TODO: That potentially might be compile-time expensive. We should try // to fix the loop-simplified form incrementally. simplifyLoop(OuterL, DT, LI, SE, AC, nullptr, PreserveLCSSA); } else { // Simplify loops for which we might've broken loop-simplify form. for (Loop *SubLoop : LoopsToSimplify) simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA); } } return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled : LoopUnrollResult::PartiallyUnrolled; } /// Given an llvm.loop loop id metadata node, returns the loop hint metadata /// node with the given name (for example, "llvm.loop.unroll.count"). If no /// such metadata node exists, then nullptr is returned. MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { // First operand should refer to the loop id itself. assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { MDNode *MD = dyn_cast(LoopID->getOperand(i)); if (!MD) continue; MDString *S = dyn_cast(MD->getOperand(0)); if (!S) continue; if (Name.equals(S->getString())) return MD; } return nullptr; }